diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65844 @@ +{ + "best_global_step": 26200, + "best_metric": 0.08348097652196884, + "best_model_checkpoint": "saves/prompt-tuning/gemma-3-1b-it/train_sst2_1744902618/checkpoint-26200", + "epoch": 10.55694866041969, + "eval_steps": 200, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013197835554968984, + "grad_norm": 6.677094459533691, + "learning_rate": 0.29999999259779675, + "loss": 9.8917, + "num_input_tokens_seen": 4448, + "step": 5 + }, + { + "epoch": 0.002639567110993797, + "grad_norm": 3.5194380283355713, + "learning_rate": 0.29999996252634736, + "loss": 3.6302, + "num_input_tokens_seen": 9120, + "step": 10 + }, + { + "epoch": 0.0039593506664906955, + "grad_norm": 1.3258579969406128, + "learning_rate": 0.2999999093230187, + "loss": 1.3341, + "num_input_tokens_seen": 14304, + "step": 15 + }, + { + "epoch": 0.005279134221987594, + "grad_norm": 1.4635974168777466, + "learning_rate": 0.299999832987819, + "loss": 1.0645, + "num_input_tokens_seen": 18688, + "step": 20 + }, + { + "epoch": 0.006598917777484493, + "grad_norm": 0.6028760075569153, + "learning_rate": 0.29999973352076004, + "loss": 0.8198, + "num_input_tokens_seen": 23008, + "step": 25 + }, + { + "epoch": 0.007918701332981391, + "grad_norm": 0.3307127356529236, + "learning_rate": 0.2999996109218572, + "loss": 0.5397, + "num_input_tokens_seen": 27776, + "step": 30 + }, + { + "epoch": 0.00923848488847829, + "grad_norm": 0.07156379520893097, + "learning_rate": 0.2999994651911293, + "loss": 0.3823, + "num_input_tokens_seen": 32384, + "step": 35 + }, + { + "epoch": 0.010558268443975187, + "grad_norm": 0.09918271005153656, + "learning_rate": 0.2999992963285989, + "loss": 0.3483, + "num_input_tokens_seen": 37152, + "step": 40 + }, + { + "epoch": 0.011878051999472087, + "grad_norm": 0.07985903322696686, + "learning_rate": 0.29999910433429194, + "loss": 0.31, + "num_input_tokens_seen": 41568, + "step": 45 + }, + { + "epoch": 0.013197835554968985, + "grad_norm": 0.14092817902565002, + "learning_rate": 0.29999888920823814, + "loss": 0.4015, + "num_input_tokens_seen": 45920, + "step": 50 + }, + { + "epoch": 0.014517619110465884, + "grad_norm": 0.050208158791065216, + "learning_rate": 0.29999865095047057, + "loss": 0.3118, + "num_input_tokens_seen": 50432, + "step": 55 + }, + { + "epoch": 0.015837402665962782, + "grad_norm": 0.022126777097582817, + "learning_rate": 0.29999838956102604, + "loss": 0.2566, + "num_input_tokens_seen": 54688, + "step": 60 + }, + { + "epoch": 0.017157186221459682, + "grad_norm": 0.056358009576797485, + "learning_rate": 0.29999810503994484, + "loss": 0.2468, + "num_input_tokens_seen": 59040, + "step": 65 + }, + { + "epoch": 0.01847696977695658, + "grad_norm": 0.041189875453710556, + "learning_rate": 0.29999779738727084, + "loss": 0.2586, + "num_input_tokens_seen": 63936, + "step": 70 + }, + { + "epoch": 0.019796753332453478, + "grad_norm": 0.07035107910633087, + "learning_rate": 0.29999746660305154, + "loss": 0.257, + "num_input_tokens_seen": 68416, + "step": 75 + }, + { + "epoch": 0.021116536887950375, + "grad_norm": 0.05390488728880882, + "learning_rate": 0.2999971126873379, + "loss": 0.2534, + "num_input_tokens_seen": 72896, + "step": 80 + }, + { + "epoch": 0.022436320443447275, + "grad_norm": 0.07272368669509888, + "learning_rate": 0.2999967356401845, + "loss": 0.2651, + "num_input_tokens_seen": 77696, + "step": 85 + }, + { + "epoch": 0.023756103998944175, + "grad_norm": 0.020887505263090134, + "learning_rate": 0.29999633546164944, + "loss": 0.2481, + "num_input_tokens_seen": 82304, + "step": 90 + }, + { + "epoch": 0.02507588755444107, + "grad_norm": 0.022415101528167725, + "learning_rate": 0.29999591215179444, + "loss": 0.2251, + "num_input_tokens_seen": 86624, + "step": 95 + }, + { + "epoch": 0.02639567110993797, + "grad_norm": 0.015712114050984383, + "learning_rate": 0.2999954657106849, + "loss": 0.2197, + "num_input_tokens_seen": 90976, + "step": 100 + }, + { + "epoch": 0.027715454665434867, + "grad_norm": 0.04850278049707413, + "learning_rate": 0.2999949961383896, + "loss": 0.2395, + "num_input_tokens_seen": 95456, + "step": 105 + }, + { + "epoch": 0.029035238220931767, + "grad_norm": 0.017223581671714783, + "learning_rate": 0.2999945034349809, + "loss": 0.2604, + "num_input_tokens_seen": 99808, + "step": 110 + }, + { + "epoch": 0.030355021776428667, + "grad_norm": 0.03246023878455162, + "learning_rate": 0.2999939876005348, + "loss": 0.2327, + "num_input_tokens_seen": 104544, + "step": 115 + }, + { + "epoch": 0.031674805331925564, + "grad_norm": 0.03288329392671585, + "learning_rate": 0.29999344863513094, + "loss": 0.2482, + "num_input_tokens_seen": 109216, + "step": 120 + }, + { + "epoch": 0.03299458888742246, + "grad_norm": 0.012930463068187237, + "learning_rate": 0.2999928865388523, + "loss": 0.254, + "num_input_tokens_seen": 113760, + "step": 125 + }, + { + "epoch": 0.034314372442919364, + "grad_norm": 0.026351887732744217, + "learning_rate": 0.29999230131178567, + "loss": 0.2407, + "num_input_tokens_seen": 118144, + "step": 130 + }, + { + "epoch": 0.03563415599841626, + "grad_norm": 0.06502445042133331, + "learning_rate": 0.2999916929540212, + "loss": 0.2415, + "num_input_tokens_seen": 122528, + "step": 135 + }, + { + "epoch": 0.03695393955391316, + "grad_norm": 0.02442285418510437, + "learning_rate": 0.29999106146565285, + "loss": 0.2491, + "num_input_tokens_seen": 127168, + "step": 140 + }, + { + "epoch": 0.03827372310941006, + "grad_norm": 0.03321746364235878, + "learning_rate": 0.29999040684677786, + "loss": 0.2379, + "num_input_tokens_seen": 131552, + "step": 145 + }, + { + "epoch": 0.039593506664906956, + "grad_norm": 0.015723813325166702, + "learning_rate": 0.2999897290974972, + "loss": 0.2267, + "num_input_tokens_seen": 136000, + "step": 150 + }, + { + "epoch": 0.04091329022040385, + "grad_norm": 0.010787628591060638, + "learning_rate": 0.2999890282179155, + "loss": 0.2423, + "num_input_tokens_seen": 140256, + "step": 155 + }, + { + "epoch": 0.04223307377590075, + "grad_norm": 0.02265007235109806, + "learning_rate": 0.29998830420814077, + "loss": 0.2387, + "num_input_tokens_seen": 144928, + "step": 160 + }, + { + "epoch": 0.04355285733139765, + "grad_norm": 0.03658902645111084, + "learning_rate": 0.2999875570682846, + "loss": 0.2382, + "num_input_tokens_seen": 149088, + "step": 165 + }, + { + "epoch": 0.04487264088689455, + "grad_norm": 0.015315888449549675, + "learning_rate": 0.2999867867984623, + "loss": 0.2214, + "num_input_tokens_seen": 153440, + "step": 170 + }, + { + "epoch": 0.046192424442391446, + "grad_norm": 0.016718102619051933, + "learning_rate": 0.29998599339879267, + "loss": 0.2536, + "num_input_tokens_seen": 157824, + "step": 175 + }, + { + "epoch": 0.04751220799788835, + "grad_norm": 0.04029353708028793, + "learning_rate": 0.29998517686939796, + "loss": 0.2515, + "num_input_tokens_seen": 162208, + "step": 180 + }, + { + "epoch": 0.048831991553385246, + "grad_norm": 0.012116455473005772, + "learning_rate": 0.29998433721040413, + "loss": 0.2328, + "num_input_tokens_seen": 166688, + "step": 185 + }, + { + "epoch": 0.05015177510888214, + "grad_norm": 0.012083180248737335, + "learning_rate": 0.29998347442194073, + "loss": 0.2457, + "num_input_tokens_seen": 171328, + "step": 190 + }, + { + "epoch": 0.05147155866437904, + "grad_norm": 0.032272953540086746, + "learning_rate": 0.2999825885041407, + "loss": 0.2792, + "num_input_tokens_seen": 175776, + "step": 195 + }, + { + "epoch": 0.05279134221987594, + "grad_norm": 0.018532248213887215, + "learning_rate": 0.29998167945714077, + "loss": 0.2762, + "num_input_tokens_seen": 180224, + "step": 200 + }, + { + "epoch": 0.05279134221987594, + "eval_loss": 0.2763712704181671, + "eval_runtime": 75.6158, + "eval_samples_per_second": 89.069, + "eval_steps_per_second": 22.27, + "num_input_tokens_seen": 180224, + "step": 200 + }, + { + "epoch": 0.05411112577537284, + "grad_norm": 0.013408242724835873, + "learning_rate": 0.2999807472810811, + "loss": 0.2815, + "num_input_tokens_seen": 184704, + "step": 205 + }, + { + "epoch": 0.055430909330869735, + "grad_norm": 0.018121235072612762, + "learning_rate": 0.29997979197610536, + "loss": 0.2575, + "num_input_tokens_seen": 189504, + "step": 210 + }, + { + "epoch": 0.05675069288636664, + "grad_norm": 0.038669925183057785, + "learning_rate": 0.299978813542361, + "loss": 0.2312, + "num_input_tokens_seen": 194144, + "step": 215 + }, + { + "epoch": 0.058070476441863535, + "grad_norm": 0.014905461110174656, + "learning_rate": 0.2999778119799988, + "loss": 0.2314, + "num_input_tokens_seen": 198848, + "step": 220 + }, + { + "epoch": 0.05939025999736043, + "grad_norm": 0.02070033922791481, + "learning_rate": 0.29997678728917326, + "loss": 0.2476, + "num_input_tokens_seen": 203136, + "step": 225 + }, + { + "epoch": 0.060710043552857335, + "grad_norm": 0.013404873199760914, + "learning_rate": 0.2999757394700424, + "loss": 0.2287, + "num_input_tokens_seen": 207712, + "step": 230 + }, + { + "epoch": 0.06202982710835423, + "grad_norm": 0.020104754716157913, + "learning_rate": 0.29997466852276783, + "loss": 0.2598, + "num_input_tokens_seen": 212192, + "step": 235 + }, + { + "epoch": 0.06334961066385113, + "grad_norm": 0.022705990821123123, + "learning_rate": 0.29997357444751466, + "loss": 0.2437, + "num_input_tokens_seen": 216832, + "step": 240 + }, + { + "epoch": 0.06466939421934803, + "grad_norm": 0.026079440489411354, + "learning_rate": 0.2999724572444516, + "loss": 0.2432, + "num_input_tokens_seen": 221248, + "step": 245 + }, + { + "epoch": 0.06598917777484492, + "grad_norm": 0.01139963697642088, + "learning_rate": 0.29997131691375095, + "loss": 0.2402, + "num_input_tokens_seen": 225664, + "step": 250 + }, + { + "epoch": 0.06730896133034182, + "grad_norm": 0.02112664468586445, + "learning_rate": 0.2999701534555886, + "loss": 0.2334, + "num_input_tokens_seen": 230112, + "step": 255 + }, + { + "epoch": 0.06862874488583873, + "grad_norm": 0.009473930113017559, + "learning_rate": 0.2999689668701439, + "loss": 0.2369, + "num_input_tokens_seen": 234400, + "step": 260 + }, + { + "epoch": 0.06994852844133562, + "grad_norm": 0.011516750790178776, + "learning_rate": 0.29996775715759993, + "loss": 0.2429, + "num_input_tokens_seen": 239104, + "step": 265 + }, + { + "epoch": 0.07126831199683252, + "grad_norm": 0.009546388871967793, + "learning_rate": 0.2999665243181432, + "loss": 0.2379, + "num_input_tokens_seen": 243616, + "step": 270 + }, + { + "epoch": 0.07258809555232942, + "grad_norm": 0.013435663655400276, + "learning_rate": 0.2999652683519638, + "loss": 0.2348, + "num_input_tokens_seen": 248288, + "step": 275 + }, + { + "epoch": 0.07390787910782631, + "grad_norm": 0.015530126169323921, + "learning_rate": 0.29996398925925544, + "loss": 0.2483, + "num_input_tokens_seen": 252992, + "step": 280 + }, + { + "epoch": 0.07522766266332322, + "grad_norm": 0.010366507805883884, + "learning_rate": 0.2999626870402154, + "loss": 0.23, + "num_input_tokens_seen": 257472, + "step": 285 + }, + { + "epoch": 0.07654744621882012, + "grad_norm": 0.010465757921338081, + "learning_rate": 0.29996136169504445, + "loss": 0.2337, + "num_input_tokens_seen": 261952, + "step": 290 + }, + { + "epoch": 0.07786722977431701, + "grad_norm": 0.009162652306258678, + "learning_rate": 0.29996001322394694, + "loss": 0.2197, + "num_input_tokens_seen": 266432, + "step": 295 + }, + { + "epoch": 0.07918701332981391, + "grad_norm": 0.012247924692928791, + "learning_rate": 0.29995864162713093, + "loss": 0.2366, + "num_input_tokens_seen": 270912, + "step": 300 + }, + { + "epoch": 0.0805067968853108, + "grad_norm": 0.013994085602462292, + "learning_rate": 0.2999572469048079, + "loss": 0.2257, + "num_input_tokens_seen": 275328, + "step": 305 + }, + { + "epoch": 0.0818265804408077, + "grad_norm": 0.01257074810564518, + "learning_rate": 0.29995582905719287, + "loss": 0.2328, + "num_input_tokens_seen": 280000, + "step": 310 + }, + { + "epoch": 0.08314636399630461, + "grad_norm": 0.028065595775842667, + "learning_rate": 0.2999543880845046, + "loss": 0.2224, + "num_input_tokens_seen": 284384, + "step": 315 + }, + { + "epoch": 0.0844661475518015, + "grad_norm": 0.01452343724668026, + "learning_rate": 0.2999529239869652, + "loss": 0.2209, + "num_input_tokens_seen": 289088, + "step": 320 + }, + { + "epoch": 0.0857859311072984, + "grad_norm": 0.013722711242735386, + "learning_rate": 0.2999514367648005, + "loss": 0.246, + "num_input_tokens_seen": 293280, + "step": 325 + }, + { + "epoch": 0.0871057146627953, + "grad_norm": 0.017086632549762726, + "learning_rate": 0.29994992641823987, + "loss": 0.2351, + "num_input_tokens_seen": 297632, + "step": 330 + }, + { + "epoch": 0.0884254982182922, + "grad_norm": 0.023476189002394676, + "learning_rate": 0.29994839294751613, + "loss": 0.2421, + "num_input_tokens_seen": 302240, + "step": 335 + }, + { + "epoch": 0.0897452817737891, + "grad_norm": 0.013950608670711517, + "learning_rate": 0.29994683635286584, + "loss": 0.2129, + "num_input_tokens_seen": 306976, + "step": 340 + }, + { + "epoch": 0.091065065329286, + "grad_norm": 0.017007580026984215, + "learning_rate": 0.2999452566345291, + "loss": 0.2367, + "num_input_tokens_seen": 311456, + "step": 345 + }, + { + "epoch": 0.09238484888478289, + "grad_norm": 0.022419661283493042, + "learning_rate": 0.2999436537927494, + "loss": 0.2258, + "num_input_tokens_seen": 315968, + "step": 350 + }, + { + "epoch": 0.0937046324402798, + "grad_norm": 0.01511939987540245, + "learning_rate": 0.299942027827774, + "loss": 0.2426, + "num_input_tokens_seen": 320480, + "step": 355 + }, + { + "epoch": 0.0950244159957767, + "grad_norm": 0.03718345984816551, + "learning_rate": 0.29994037873985363, + "loss": 0.2387, + "num_input_tokens_seen": 325088, + "step": 360 + }, + { + "epoch": 0.09634419955127359, + "grad_norm": 0.01395063754171133, + "learning_rate": 0.29993870652924254, + "loss": 0.2252, + "num_input_tokens_seen": 329600, + "step": 365 + }, + { + "epoch": 0.09766398310677049, + "grad_norm": 0.014620993286371231, + "learning_rate": 0.29993701119619876, + "loss": 0.2136, + "num_input_tokens_seen": 334048, + "step": 370 + }, + { + "epoch": 0.0989837666622674, + "grad_norm": 0.0176034327596426, + "learning_rate": 0.2999352927409835, + "loss": 0.2334, + "num_input_tokens_seen": 338496, + "step": 375 + }, + { + "epoch": 0.10030355021776428, + "grad_norm": 0.014660129323601723, + "learning_rate": 0.29993355116386194, + "loss": 0.2226, + "num_input_tokens_seen": 343424, + "step": 380 + }, + { + "epoch": 0.10162333377326119, + "grad_norm": 0.013640029355883598, + "learning_rate": 0.29993178646510266, + "loss": 0.2033, + "num_input_tokens_seen": 347680, + "step": 385 + }, + { + "epoch": 0.10294311732875808, + "grad_norm": 0.03237594664096832, + "learning_rate": 0.2999299986449777, + "loss": 0.2401, + "num_input_tokens_seen": 352160, + "step": 390 + }, + { + "epoch": 0.10426290088425498, + "grad_norm": 0.020892232656478882, + "learning_rate": 0.29992818770376284, + "loss": 0.2449, + "num_input_tokens_seen": 356736, + "step": 395 + }, + { + "epoch": 0.10558268443975188, + "grad_norm": 0.01753915660083294, + "learning_rate": 0.29992635364173725, + "loss": 0.2356, + "num_input_tokens_seen": 361024, + "step": 400 + }, + { + "epoch": 0.10558268443975188, + "eval_loss": 0.22394408285617828, + "eval_runtime": 75.8113, + "eval_samples_per_second": 88.839, + "eval_steps_per_second": 22.213, + "num_input_tokens_seen": 361024, + "step": 400 + }, + { + "epoch": 0.10690246799524877, + "grad_norm": 0.014188665896654129, + "learning_rate": 0.2999244964591839, + "loss": 0.2179, + "num_input_tokens_seen": 365728, + "step": 405 + }, + { + "epoch": 0.10822225155074568, + "grad_norm": 0.01604193076491356, + "learning_rate": 0.2999226161563891, + "loss": 0.2235, + "num_input_tokens_seen": 370368, + "step": 410 + }, + { + "epoch": 0.10954203510624258, + "grad_norm": 0.020996225997805595, + "learning_rate": 0.2999207127336429, + "loss": 0.243, + "num_input_tokens_seen": 374976, + "step": 415 + }, + { + "epoch": 0.11086181866173947, + "grad_norm": 0.01242190320044756, + "learning_rate": 0.2999187861912387, + "loss": 0.1993, + "num_input_tokens_seen": 379424, + "step": 420 + }, + { + "epoch": 0.11218160221723637, + "grad_norm": 0.02116958424448967, + "learning_rate": 0.2999168365294737, + "loss": 0.2351, + "num_input_tokens_seen": 383968, + "step": 425 + }, + { + "epoch": 0.11350138577273328, + "grad_norm": 0.021505003795027733, + "learning_rate": 0.29991486374864856, + "loss": 0.23, + "num_input_tokens_seen": 388384, + "step": 430 + }, + { + "epoch": 0.11482116932823017, + "grad_norm": 0.013396223075687885, + "learning_rate": 0.29991286784906745, + "loss": 0.2245, + "num_input_tokens_seen": 392672, + "step": 435 + }, + { + "epoch": 0.11614095288372707, + "grad_norm": 0.026559041813015938, + "learning_rate": 0.2999108488310382, + "loss": 0.2324, + "num_input_tokens_seen": 397440, + "step": 440 + }, + { + "epoch": 0.11746073643922397, + "grad_norm": 0.019550181925296783, + "learning_rate": 0.29990880669487213, + "loss": 0.2323, + "num_input_tokens_seen": 401984, + "step": 445 + }, + { + "epoch": 0.11878051999472086, + "grad_norm": 0.02342870458960533, + "learning_rate": 0.29990674144088425, + "loss": 0.2359, + "num_input_tokens_seen": 406400, + "step": 450 + }, + { + "epoch": 0.12010030355021777, + "grad_norm": 0.016767160966992378, + "learning_rate": 0.299904653069393, + "loss": 0.222, + "num_input_tokens_seen": 410912, + "step": 455 + }, + { + "epoch": 0.12142008710571467, + "grad_norm": 0.03284667059779167, + "learning_rate": 0.29990254158072044, + "loss": 0.243, + "num_input_tokens_seen": 415360, + "step": 460 + }, + { + "epoch": 0.12273987066121156, + "grad_norm": 0.017589237540960312, + "learning_rate": 0.2999004069751921, + "loss": 0.2722, + "num_input_tokens_seen": 420032, + "step": 465 + }, + { + "epoch": 0.12405965421670846, + "grad_norm": 0.012029132805764675, + "learning_rate": 0.2998982492531373, + "loss": 0.2551, + "num_input_tokens_seen": 424640, + "step": 470 + }, + { + "epoch": 0.12537943777220537, + "grad_norm": 0.013526135124266148, + "learning_rate": 0.2998960684148887, + "loss": 0.2243, + "num_input_tokens_seen": 429248, + "step": 475 + }, + { + "epoch": 0.12669922132770225, + "grad_norm": 0.026649562641978264, + "learning_rate": 0.29989386446078264, + "loss": 0.2418, + "num_input_tokens_seen": 433792, + "step": 480 + }, + { + "epoch": 0.12801900488319914, + "grad_norm": 0.015574820339679718, + "learning_rate": 0.299891637391159, + "loss": 0.2216, + "num_input_tokens_seen": 438208, + "step": 485 + }, + { + "epoch": 0.12933878843869606, + "grad_norm": 0.01922217197716236, + "learning_rate": 0.2998893872063612, + "loss": 0.2279, + "num_input_tokens_seen": 442656, + "step": 490 + }, + { + "epoch": 0.13065857199419295, + "grad_norm": 0.016456065699458122, + "learning_rate": 0.2998871139067363, + "loss": 0.2241, + "num_input_tokens_seen": 446976, + "step": 495 + }, + { + "epoch": 0.13197835554968984, + "grad_norm": 0.03608671948313713, + "learning_rate": 0.2998848174926348, + "loss": 0.2085, + "num_input_tokens_seen": 451840, + "step": 500 + }, + { + "epoch": 0.13329813910518676, + "grad_norm": 0.03581022471189499, + "learning_rate": 0.2998824979644109, + "loss": 0.2437, + "num_input_tokens_seen": 456096, + "step": 505 + }, + { + "epoch": 0.13461792266068365, + "grad_norm": 0.029414229094982147, + "learning_rate": 0.29988015532242224, + "loss": 0.2459, + "num_input_tokens_seen": 460704, + "step": 510 + }, + { + "epoch": 0.13593770621618054, + "grad_norm": 0.019595574587583542, + "learning_rate": 0.29987778956703015, + "loss": 0.2136, + "num_input_tokens_seen": 465472, + "step": 515 + }, + { + "epoch": 0.13725748977167745, + "grad_norm": 0.0348958782851696, + "learning_rate": 0.2998754006985994, + "loss": 0.2176, + "num_input_tokens_seen": 469856, + "step": 520 + }, + { + "epoch": 0.13857727332717434, + "grad_norm": 0.027135366573929787, + "learning_rate": 0.29987298871749846, + "loss": 0.2271, + "num_input_tokens_seen": 474208, + "step": 525 + }, + { + "epoch": 0.13989705688267123, + "grad_norm": 0.018741633743047714, + "learning_rate": 0.2998705536240992, + "loss": 0.2004, + "num_input_tokens_seen": 478656, + "step": 530 + }, + { + "epoch": 0.14121684043816815, + "grad_norm": 0.020587505772709846, + "learning_rate": 0.2998680954187772, + "loss": 0.2233, + "num_input_tokens_seen": 483232, + "step": 535 + }, + { + "epoch": 0.14253662399366504, + "grad_norm": 0.01939532160758972, + "learning_rate": 0.2998656141019115, + "loss": 0.2102, + "num_input_tokens_seen": 487744, + "step": 540 + }, + { + "epoch": 0.14385640754916193, + "grad_norm": 0.029417065903544426, + "learning_rate": 0.2998631096738848, + "loss": 0.2233, + "num_input_tokens_seen": 492160, + "step": 545 + }, + { + "epoch": 0.14517619110465885, + "grad_norm": 0.01959785260260105, + "learning_rate": 0.29986058213508326, + "loss": 0.2121, + "num_input_tokens_seen": 496800, + "step": 550 + }, + { + "epoch": 0.14649597466015574, + "grad_norm": 0.023049186915159225, + "learning_rate": 0.29985803148589674, + "loss": 0.2148, + "num_input_tokens_seen": 501088, + "step": 555 + }, + { + "epoch": 0.14781575821565263, + "grad_norm": 0.025474144145846367, + "learning_rate": 0.2998554577267185, + "loss": 0.2464, + "num_input_tokens_seen": 505344, + "step": 560 + }, + { + "epoch": 0.14913554177114954, + "grad_norm": 0.04447808861732483, + "learning_rate": 0.2998528608579455, + "loss": 0.2445, + "num_input_tokens_seen": 510304, + "step": 565 + }, + { + "epoch": 0.15045532532664643, + "grad_norm": 0.053760968148708344, + "learning_rate": 0.2998502408799781, + "loss": 0.295, + "num_input_tokens_seen": 514752, + "step": 570 + }, + { + "epoch": 0.15177510888214332, + "grad_norm": 0.014392268843948841, + "learning_rate": 0.2998475977932205, + "loss": 0.2692, + "num_input_tokens_seen": 519424, + "step": 575 + }, + { + "epoch": 0.15309489243764024, + "grad_norm": 0.012679197825491428, + "learning_rate": 0.29984493159808023, + "loss": 0.2231, + "num_input_tokens_seen": 523840, + "step": 580 + }, + { + "epoch": 0.15441467599313713, + "grad_norm": 0.013995054177939892, + "learning_rate": 0.29984224229496836, + "loss": 0.2256, + "num_input_tokens_seen": 528352, + "step": 585 + }, + { + "epoch": 0.15573445954863402, + "grad_norm": 0.015209496021270752, + "learning_rate": 0.2998395298842998, + "loss": 0.2195, + "num_input_tokens_seen": 532832, + "step": 590 + }, + { + "epoch": 0.15705424310413094, + "grad_norm": 0.02589152753353119, + "learning_rate": 0.29983679436649263, + "loss": 0.2198, + "num_input_tokens_seen": 537088, + "step": 595 + }, + { + "epoch": 0.15837402665962783, + "grad_norm": 0.03899440914392471, + "learning_rate": 0.2998340357419689, + "loss": 0.224, + "num_input_tokens_seen": 541408, + "step": 600 + }, + { + "epoch": 0.15837402665962783, + "eval_loss": 0.2098138928413391, + "eval_runtime": 75.7269, + "eval_samples_per_second": 88.938, + "eval_steps_per_second": 22.238, + "num_input_tokens_seen": 541408, + "step": 600 + }, + { + "epoch": 0.15969381021512472, + "grad_norm": 0.02612273022532463, + "learning_rate": 0.29983125401115385, + "loss": 0.2271, + "num_input_tokens_seen": 545984, + "step": 605 + }, + { + "epoch": 0.1610135937706216, + "grad_norm": 0.02855265513062477, + "learning_rate": 0.29982844917447654, + "loss": 0.2225, + "num_input_tokens_seen": 550208, + "step": 610 + }, + { + "epoch": 0.16233337732611852, + "grad_norm": 0.03670830652117729, + "learning_rate": 0.2998256212323695, + "loss": 0.2386, + "num_input_tokens_seen": 554816, + "step": 615 + }, + { + "epoch": 0.1636531608816154, + "grad_norm": 0.03463875502347946, + "learning_rate": 0.29982277018526887, + "loss": 0.247, + "num_input_tokens_seen": 559328, + "step": 620 + }, + { + "epoch": 0.1649729444371123, + "grad_norm": 0.014423350803554058, + "learning_rate": 0.2998198960336143, + "loss": 0.1962, + "num_input_tokens_seen": 563872, + "step": 625 + }, + { + "epoch": 0.16629272799260922, + "grad_norm": 0.020694725215435028, + "learning_rate": 0.299816998777849, + "loss": 0.2234, + "num_input_tokens_seen": 568352, + "step": 630 + }, + { + "epoch": 0.1676125115481061, + "grad_norm": 0.02361847087740898, + "learning_rate": 0.2998140784184197, + "loss": 0.2234, + "num_input_tokens_seen": 572896, + "step": 635 + }, + { + "epoch": 0.168932295103603, + "grad_norm": 0.028091207146644592, + "learning_rate": 0.2998111349557769, + "loss": 0.2136, + "num_input_tokens_seen": 577376, + "step": 640 + }, + { + "epoch": 0.17025207865909991, + "grad_norm": 0.01473987940698862, + "learning_rate": 0.29980816839037444, + "loss": 0.1813, + "num_input_tokens_seen": 581888, + "step": 645 + }, + { + "epoch": 0.1715718622145968, + "grad_norm": 0.018099360167980194, + "learning_rate": 0.2998051787226698, + "loss": 0.1731, + "num_input_tokens_seen": 586656, + "step": 650 + }, + { + "epoch": 0.1728916457700937, + "grad_norm": 0.024372300133109093, + "learning_rate": 0.29980216595312403, + "loss": 0.2117, + "num_input_tokens_seen": 591200, + "step": 655 + }, + { + "epoch": 0.1742114293255906, + "grad_norm": 0.027297087013721466, + "learning_rate": 0.29979913008220177, + "loss": 0.2041, + "num_input_tokens_seen": 595456, + "step": 660 + }, + { + "epoch": 0.1755312128810875, + "grad_norm": 0.025453729555010796, + "learning_rate": 0.2997960711103711, + "loss": 0.186, + "num_input_tokens_seen": 599840, + "step": 665 + }, + { + "epoch": 0.1768509964365844, + "grad_norm": 0.019893085584044456, + "learning_rate": 0.29979298903810386, + "loss": 0.1689, + "num_input_tokens_seen": 604320, + "step": 670 + }, + { + "epoch": 0.1781707799920813, + "grad_norm": 0.033500079065561295, + "learning_rate": 0.29978988386587524, + "loss": 0.1871, + "num_input_tokens_seen": 608512, + "step": 675 + }, + { + "epoch": 0.1794905635475782, + "grad_norm": 0.016520898789167404, + "learning_rate": 0.2997867555941642, + "loss": 0.2276, + "num_input_tokens_seen": 613088, + "step": 680 + }, + { + "epoch": 0.1808103471030751, + "grad_norm": 0.022718854248523712, + "learning_rate": 0.299783604223453, + "loss": 0.1916, + "num_input_tokens_seen": 617760, + "step": 685 + }, + { + "epoch": 0.182130130658572, + "grad_norm": 0.029047425836324692, + "learning_rate": 0.29978042975422786, + "loss": 0.2224, + "num_input_tokens_seen": 622304, + "step": 690 + }, + { + "epoch": 0.1834499142140689, + "grad_norm": 0.030919566750526428, + "learning_rate": 0.29977723218697816, + "loss": 0.2201, + "num_input_tokens_seen": 626880, + "step": 695 + }, + { + "epoch": 0.18476969776956578, + "grad_norm": 0.024670401588082314, + "learning_rate": 0.299774011522197, + "loss": 0.243, + "num_input_tokens_seen": 631744, + "step": 700 + }, + { + "epoch": 0.1860894813250627, + "grad_norm": 0.033932507038116455, + "learning_rate": 0.29977076776038114, + "loss": 0.2607, + "num_input_tokens_seen": 636256, + "step": 705 + }, + { + "epoch": 0.1874092648805596, + "grad_norm": 0.01940137706696987, + "learning_rate": 0.2997675009020307, + "loss": 0.2351, + "num_input_tokens_seen": 640864, + "step": 710 + }, + { + "epoch": 0.18872904843605648, + "grad_norm": 0.017104987055063248, + "learning_rate": 0.2997642109476496, + "loss": 0.2121, + "num_input_tokens_seen": 645120, + "step": 715 + }, + { + "epoch": 0.1900488319915534, + "grad_norm": 0.02033950388431549, + "learning_rate": 0.299760897897745, + "loss": 0.205, + "num_input_tokens_seen": 649504, + "step": 720 + }, + { + "epoch": 0.19136861554705029, + "grad_norm": 0.014710627496242523, + "learning_rate": 0.29975756175282803, + "loss": 0.1925, + "num_input_tokens_seen": 654048, + "step": 725 + }, + { + "epoch": 0.19268839910254718, + "grad_norm": 0.0232036504894495, + "learning_rate": 0.29975420251341306, + "loss": 0.2041, + "num_input_tokens_seen": 658752, + "step": 730 + }, + { + "epoch": 0.1940081826580441, + "grad_norm": 0.02104211039841175, + "learning_rate": 0.29975082018001814, + "loss": 0.2219, + "num_input_tokens_seen": 663040, + "step": 735 + }, + { + "epoch": 0.19532796621354098, + "grad_norm": 0.01732596941292286, + "learning_rate": 0.2997474147531648, + "loss": 0.1998, + "num_input_tokens_seen": 667680, + "step": 740 + }, + { + "epoch": 0.19664774976903787, + "grad_norm": 0.020341604948043823, + "learning_rate": 0.29974398623337833, + "loss": 0.2172, + "num_input_tokens_seen": 672224, + "step": 745 + }, + { + "epoch": 0.1979675333245348, + "grad_norm": 0.02337329275906086, + "learning_rate": 0.2997405346211873, + "loss": 0.1739, + "num_input_tokens_seen": 676864, + "step": 750 + }, + { + "epoch": 0.19928731688003168, + "grad_norm": 0.020633738487958908, + "learning_rate": 0.2997370599171241, + "loss": 0.1731, + "num_input_tokens_seen": 681472, + "step": 755 + }, + { + "epoch": 0.20060710043552857, + "grad_norm": 0.02973725087940693, + "learning_rate": 0.2997335621217246, + "loss": 0.2229, + "num_input_tokens_seen": 686112, + "step": 760 + }, + { + "epoch": 0.20192688399102549, + "grad_norm": 0.013670760206878185, + "learning_rate": 0.29973004123552816, + "loss": 0.2576, + "num_input_tokens_seen": 690496, + "step": 765 + }, + { + "epoch": 0.20324666754652237, + "grad_norm": 0.015076623298227787, + "learning_rate": 0.2997264972590777, + "loss": 0.2317, + "num_input_tokens_seen": 694976, + "step": 770 + }, + { + "epoch": 0.20456645110201926, + "grad_norm": 0.013737809844315052, + "learning_rate": 0.29972293019291973, + "loss": 0.2406, + "num_input_tokens_seen": 699584, + "step": 775 + }, + { + "epoch": 0.20588623465751615, + "grad_norm": 0.01564852148294449, + "learning_rate": 0.2997193400376045, + "loss": 0.2391, + "num_input_tokens_seen": 704224, + "step": 780 + }, + { + "epoch": 0.20720601821301307, + "grad_norm": 0.0372004397213459, + "learning_rate": 0.2997157267936854, + "loss": 0.2231, + "num_input_tokens_seen": 708896, + "step": 785 + }, + { + "epoch": 0.20852580176850996, + "grad_norm": 0.038940802216529846, + "learning_rate": 0.2997120904617199, + "loss": 0.227, + "num_input_tokens_seen": 713344, + "step": 790 + }, + { + "epoch": 0.20984558532400685, + "grad_norm": 0.016891151666641235, + "learning_rate": 0.29970843104226863, + "loss": 0.1945, + "num_input_tokens_seen": 717920, + "step": 795 + }, + { + "epoch": 0.21116536887950377, + "grad_norm": 0.019245754927396774, + "learning_rate": 0.2997047485358959, + "loss": 0.202, + "num_input_tokens_seen": 722496, + "step": 800 + }, + { + "epoch": 0.21116536887950377, + "eval_loss": 0.18965022265911102, + "eval_runtime": 75.8609, + "eval_samples_per_second": 88.781, + "eval_steps_per_second": 22.199, + "num_input_tokens_seen": 722496, + "step": 800 + }, + { + "epoch": 0.21248515243500066, + "grad_norm": 0.019390780478715897, + "learning_rate": 0.2997010429431697, + "loss": 0.1897, + "num_input_tokens_seen": 727104, + "step": 805 + }, + { + "epoch": 0.21380493599049755, + "grad_norm": 0.020296314731240273, + "learning_rate": 0.29969731426466134, + "loss": 0.1804, + "num_input_tokens_seen": 731648, + "step": 810 + }, + { + "epoch": 0.21512471954599446, + "grad_norm": 0.031563650816679, + "learning_rate": 0.299693562500946, + "loss": 0.194, + "num_input_tokens_seen": 736000, + "step": 815 + }, + { + "epoch": 0.21644450310149135, + "grad_norm": 0.030905431136488914, + "learning_rate": 0.29968978765260207, + "loss": 0.1978, + "num_input_tokens_seen": 740672, + "step": 820 + }, + { + "epoch": 0.21776428665698824, + "grad_norm": 0.020552564412355423, + "learning_rate": 0.2996859897202118, + "loss": 0.1796, + "num_input_tokens_seen": 745088, + "step": 825 + }, + { + "epoch": 0.21908407021248516, + "grad_norm": 0.025276528671383858, + "learning_rate": 0.2996821687043609, + "loss": 0.1967, + "num_input_tokens_seen": 749728, + "step": 830 + }, + { + "epoch": 0.22040385376798205, + "grad_norm": 0.022461742162704468, + "learning_rate": 0.2996783246056384, + "loss": 0.2143, + "num_input_tokens_seen": 754144, + "step": 835 + }, + { + "epoch": 0.22172363732347894, + "grad_norm": 0.039467718452215195, + "learning_rate": 0.29967445742463744, + "loss": 0.2172, + "num_input_tokens_seen": 759008, + "step": 840 + }, + { + "epoch": 0.22304342087897586, + "grad_norm": 0.024768516421318054, + "learning_rate": 0.29967056716195417, + "loss": 0.1702, + "num_input_tokens_seen": 763392, + "step": 845 + }, + { + "epoch": 0.22436320443447275, + "grad_norm": 0.029736513271927834, + "learning_rate": 0.2996666538181885, + "loss": 0.2078, + "num_input_tokens_seen": 767936, + "step": 850 + }, + { + "epoch": 0.22568298798996964, + "grad_norm": 0.019354358315467834, + "learning_rate": 0.29966271739394407, + "loss": 0.1759, + "num_input_tokens_seen": 772384, + "step": 855 + }, + { + "epoch": 0.22700277154546655, + "grad_norm": 0.02574213407933712, + "learning_rate": 0.29965875788982776, + "loss": 0.1891, + "num_input_tokens_seen": 776864, + "step": 860 + }, + { + "epoch": 0.22832255510096344, + "grad_norm": 0.01876310259103775, + "learning_rate": 0.2996547753064503, + "loss": 0.1806, + "num_input_tokens_seen": 781536, + "step": 865 + }, + { + "epoch": 0.22964233865646033, + "grad_norm": 0.027809128165245056, + "learning_rate": 0.29965076964442583, + "loss": 0.1993, + "num_input_tokens_seen": 785856, + "step": 870 + }, + { + "epoch": 0.23096212221195725, + "grad_norm": 0.029446261003613472, + "learning_rate": 0.299646740904372, + "loss": 0.2168, + "num_input_tokens_seen": 790464, + "step": 875 + }, + { + "epoch": 0.23228190576745414, + "grad_norm": 0.024715347215533257, + "learning_rate": 0.29964268908691016, + "loss": 0.2204, + "num_input_tokens_seen": 794912, + "step": 880 + }, + { + "epoch": 0.23360168932295103, + "grad_norm": 0.017755035310983658, + "learning_rate": 0.29963861419266513, + "loss": 0.1759, + "num_input_tokens_seen": 799584, + "step": 885 + }, + { + "epoch": 0.23492147287844795, + "grad_norm": 0.02756551280617714, + "learning_rate": 0.29963451622226533, + "loss": 0.1785, + "num_input_tokens_seen": 804000, + "step": 890 + }, + { + "epoch": 0.23624125643394484, + "grad_norm": 0.019246403127908707, + "learning_rate": 0.29963039517634277, + "loss": 0.1902, + "num_input_tokens_seen": 808384, + "step": 895 + }, + { + "epoch": 0.23756103998944172, + "grad_norm": 0.012400196865200996, + "learning_rate": 0.2996262510555328, + "loss": 0.1825, + "num_input_tokens_seen": 813248, + "step": 900 + }, + { + "epoch": 0.23888082354493864, + "grad_norm": 0.014721804298460484, + "learning_rate": 0.2996220838604746, + "loss": 0.1826, + "num_input_tokens_seen": 817920, + "step": 905 + }, + { + "epoch": 0.24020060710043553, + "grad_norm": 0.01871349662542343, + "learning_rate": 0.29961789359181085, + "loss": 0.1657, + "num_input_tokens_seen": 822624, + "step": 910 + }, + { + "epoch": 0.24152039065593242, + "grad_norm": 0.04101996496319771, + "learning_rate": 0.29961368025018764, + "loss": 0.2164, + "num_input_tokens_seen": 826784, + "step": 915 + }, + { + "epoch": 0.24284017421142934, + "grad_norm": 0.02952665276825428, + "learning_rate": 0.2996094438362548, + "loss": 0.2413, + "num_input_tokens_seen": 831232, + "step": 920 + }, + { + "epoch": 0.24415995776692623, + "grad_norm": 0.009108531288802624, + "learning_rate": 0.2996051843506657, + "loss": 0.1675, + "num_input_tokens_seen": 835712, + "step": 925 + }, + { + "epoch": 0.24547974132242312, + "grad_norm": 0.014543977566063404, + "learning_rate": 0.299600901794077, + "loss": 0.1842, + "num_input_tokens_seen": 840128, + "step": 930 + }, + { + "epoch": 0.24679952487792003, + "grad_norm": 0.015053880400955677, + "learning_rate": 0.29959659616714923, + "loss": 0.1693, + "num_input_tokens_seen": 844704, + "step": 935 + }, + { + "epoch": 0.24811930843341692, + "grad_norm": 0.02488836646080017, + "learning_rate": 0.2995922674705464, + "loss": 0.1997, + "num_input_tokens_seen": 849504, + "step": 940 + }, + { + "epoch": 0.2494390919889138, + "grad_norm": 0.015438751317560673, + "learning_rate": 0.2995879157049361, + "loss": 0.1999, + "num_input_tokens_seen": 854016, + "step": 945 + }, + { + "epoch": 0.25075887554441073, + "grad_norm": 0.01926361955702305, + "learning_rate": 0.2995835408709893, + "loss": 0.2174, + "num_input_tokens_seen": 858400, + "step": 950 + }, + { + "epoch": 0.2520786590999076, + "grad_norm": 0.02791557088494301, + "learning_rate": 0.29957914296938076, + "loss": 0.2156, + "num_input_tokens_seen": 862912, + "step": 955 + }, + { + "epoch": 0.2533984426554045, + "grad_norm": 0.012956407852470875, + "learning_rate": 0.2995747220007886, + "loss": 0.1897, + "num_input_tokens_seen": 867680, + "step": 960 + }, + { + "epoch": 0.2547182262109014, + "grad_norm": 0.018951639533042908, + "learning_rate": 0.2995702779658947, + "loss": 0.2019, + "num_input_tokens_seen": 871968, + "step": 965 + }, + { + "epoch": 0.2560380097663983, + "grad_norm": 0.01760631985962391, + "learning_rate": 0.29956581086538425, + "loss": 0.1875, + "num_input_tokens_seen": 876480, + "step": 970 + }, + { + "epoch": 0.25735779332189523, + "grad_norm": 0.015180498361587524, + "learning_rate": 0.2995613206999462, + "loss": 0.1745, + "num_input_tokens_seen": 880992, + "step": 975 + }, + { + "epoch": 0.2586775768773921, + "grad_norm": 0.014722463674843311, + "learning_rate": 0.29955680747027297, + "loss": 0.1765, + "num_input_tokens_seen": 885728, + "step": 980 + }, + { + "epoch": 0.259997360432889, + "grad_norm": 0.019271686673164368, + "learning_rate": 0.2995522711770607, + "loss": 0.1577, + "num_input_tokens_seen": 890112, + "step": 985 + }, + { + "epoch": 0.2613171439883859, + "grad_norm": 0.016230188310146332, + "learning_rate": 0.2995477118210087, + "loss": 0.1729, + "num_input_tokens_seen": 894464, + "step": 990 + }, + { + "epoch": 0.2626369275438828, + "grad_norm": 0.013542043045163155, + "learning_rate": 0.29954312940282024, + "loss": 0.1723, + "num_input_tokens_seen": 898656, + "step": 995 + }, + { + "epoch": 0.2639567110993797, + "grad_norm": 0.013104903511703014, + "learning_rate": 0.29953852392320196, + "loss": 0.149, + "num_input_tokens_seen": 903200, + "step": 1000 + }, + { + "epoch": 0.2639567110993797, + "eval_loss": 0.1768295019865036, + "eval_runtime": 75.8257, + "eval_samples_per_second": 88.822, + "eval_steps_per_second": 22.209, + "num_input_tokens_seen": 903200, + "step": 1000 + }, + { + "epoch": 0.2652764946548766, + "grad_norm": 0.02251453511416912, + "learning_rate": 0.2995338953828641, + "loss": 0.1422, + "num_input_tokens_seen": 907680, + "step": 1005 + }, + { + "epoch": 0.2665962782103735, + "grad_norm": 0.024892162531614304, + "learning_rate": 0.2995292437825204, + "loss": 0.1713, + "num_input_tokens_seen": 912288, + "step": 1010 + }, + { + "epoch": 0.2679160617658704, + "grad_norm": 0.018052786588668823, + "learning_rate": 0.29952456912288816, + "loss": 0.1682, + "num_input_tokens_seen": 916800, + "step": 1015 + }, + { + "epoch": 0.2692358453213673, + "grad_norm": 0.01923326961696148, + "learning_rate": 0.2995198714046884, + "loss": 0.1639, + "num_input_tokens_seen": 920928, + "step": 1020 + }, + { + "epoch": 0.2705556288768642, + "grad_norm": 0.013904568739235401, + "learning_rate": 0.2995151506286454, + "loss": 0.1891, + "num_input_tokens_seen": 925312, + "step": 1025 + }, + { + "epoch": 0.2718754124323611, + "grad_norm": 0.018324246630072594, + "learning_rate": 0.2995104067954873, + "loss": 0.1889, + "num_input_tokens_seen": 930080, + "step": 1030 + }, + { + "epoch": 0.27319519598785796, + "grad_norm": 0.01762617938220501, + "learning_rate": 0.2995056399059456, + "loss": 0.1725, + "num_input_tokens_seen": 934656, + "step": 1035 + }, + { + "epoch": 0.2745149795433549, + "grad_norm": 0.012809723615646362, + "learning_rate": 0.2995008499607554, + "loss": 0.1783, + "num_input_tokens_seen": 938880, + "step": 1040 + }, + { + "epoch": 0.2758347630988518, + "grad_norm": 0.015525863505899906, + "learning_rate": 0.2994960369606554, + "loss": 0.1777, + "num_input_tokens_seen": 943360, + "step": 1045 + }, + { + "epoch": 0.2771545466543487, + "grad_norm": 0.02479635924100876, + "learning_rate": 0.2994912009063878, + "loss": 0.168, + "num_input_tokens_seen": 947968, + "step": 1050 + }, + { + "epoch": 0.2784743302098456, + "grad_norm": 0.012723193503916264, + "learning_rate": 0.29948634179869843, + "loss": 0.1738, + "num_input_tokens_seen": 952576, + "step": 1055 + }, + { + "epoch": 0.27979411376534247, + "grad_norm": 0.017463108524680138, + "learning_rate": 0.29948145963833656, + "loss": 0.195, + "num_input_tokens_seen": 957120, + "step": 1060 + }, + { + "epoch": 0.28111389732083936, + "grad_norm": 0.014511752873659134, + "learning_rate": 0.29947655442605514, + "loss": 0.1745, + "num_input_tokens_seen": 961184, + "step": 1065 + }, + { + "epoch": 0.2824336808763363, + "grad_norm": 0.019120479002594948, + "learning_rate": 0.2994716261626106, + "loss": 0.1907, + "num_input_tokens_seen": 965888, + "step": 1070 + }, + { + "epoch": 0.2837534644318332, + "grad_norm": 0.019143512472510338, + "learning_rate": 0.2994666748487629, + "loss": 0.1572, + "num_input_tokens_seen": 970496, + "step": 1075 + }, + { + "epoch": 0.2850732479873301, + "grad_norm": 0.013463502749800682, + "learning_rate": 0.2994617004852756, + "loss": 0.1725, + "num_input_tokens_seen": 975232, + "step": 1080 + }, + { + "epoch": 0.28639303154282697, + "grad_norm": 0.01735137216746807, + "learning_rate": 0.2994567030729159, + "loss": 0.153, + "num_input_tokens_seen": 979680, + "step": 1085 + }, + { + "epoch": 0.28771281509832386, + "grad_norm": 0.022447416558861732, + "learning_rate": 0.29945168261245436, + "loss": 0.189, + "num_input_tokens_seen": 984480, + "step": 1090 + }, + { + "epoch": 0.28903259865382075, + "grad_norm": 0.0232168547809124, + "learning_rate": 0.29944663910466524, + "loss": 0.1901, + "num_input_tokens_seen": 988928, + "step": 1095 + }, + { + "epoch": 0.2903523822093177, + "grad_norm": 0.01743777096271515, + "learning_rate": 0.2994415725503263, + "loss": 0.1994, + "num_input_tokens_seen": 993120, + "step": 1100 + }, + { + "epoch": 0.2916721657648146, + "grad_norm": 0.01917177066206932, + "learning_rate": 0.29943648295021885, + "loss": 0.214, + "num_input_tokens_seen": 997696, + "step": 1105 + }, + { + "epoch": 0.2929919493203115, + "grad_norm": 0.01948252134025097, + "learning_rate": 0.2994313703051278, + "loss": 0.1874, + "num_input_tokens_seen": 1002112, + "step": 1110 + }, + { + "epoch": 0.29431173287580836, + "grad_norm": 0.030179573222994804, + "learning_rate": 0.29942623461584156, + "loss": 0.2142, + "num_input_tokens_seen": 1006880, + "step": 1115 + }, + { + "epoch": 0.29563151643130525, + "grad_norm": 0.013341312296688557, + "learning_rate": 0.29942107588315214, + "loss": 0.1704, + "num_input_tokens_seen": 1011392, + "step": 1120 + }, + { + "epoch": 0.29695129998680214, + "grad_norm": 0.025752197951078415, + "learning_rate": 0.29941589410785513, + "loss": 0.2023, + "num_input_tokens_seen": 1015840, + "step": 1125 + }, + { + "epoch": 0.2982710835422991, + "grad_norm": 0.01646181382238865, + "learning_rate": 0.29941068929074954, + "loss": 0.1656, + "num_input_tokens_seen": 1020608, + "step": 1130 + }, + { + "epoch": 0.299590867097796, + "grad_norm": 0.013332548551261425, + "learning_rate": 0.2994054614326381, + "loss": 0.2256, + "num_input_tokens_seen": 1025088, + "step": 1135 + }, + { + "epoch": 0.30091065065329287, + "grad_norm": 0.0266672782599926, + "learning_rate": 0.29940021053432686, + "loss": 0.2407, + "num_input_tokens_seen": 1029664, + "step": 1140 + }, + { + "epoch": 0.30223043420878976, + "grad_norm": 0.0208321250975132, + "learning_rate": 0.29939493659662575, + "loss": 0.2103, + "num_input_tokens_seen": 1034016, + "step": 1145 + }, + { + "epoch": 0.30355021776428665, + "grad_norm": 0.00892278365790844, + "learning_rate": 0.299389639620348, + "loss": 0.1987, + "num_input_tokens_seen": 1038528, + "step": 1150 + }, + { + "epoch": 0.30487000131978353, + "grad_norm": 0.01982424594461918, + "learning_rate": 0.29938431960631046, + "loss": 0.1876, + "num_input_tokens_seen": 1043392, + "step": 1155 + }, + { + "epoch": 0.3061897848752805, + "grad_norm": 0.014881512150168419, + "learning_rate": 0.2993789765553335, + "loss": 0.2127, + "num_input_tokens_seen": 1048128, + "step": 1160 + }, + { + "epoch": 0.30750956843077737, + "grad_norm": 0.014781773090362549, + "learning_rate": 0.2993736104682412, + "loss": 0.1605, + "num_input_tokens_seen": 1053120, + "step": 1165 + }, + { + "epoch": 0.30882935198627426, + "grad_norm": 0.02260724827647209, + "learning_rate": 0.299368221345861, + "loss": 0.1361, + "num_input_tokens_seen": 1057728, + "step": 1170 + }, + { + "epoch": 0.31014913554177115, + "grad_norm": 0.014083570800721645, + "learning_rate": 0.29936280918902397, + "loss": 0.155, + "num_input_tokens_seen": 1062336, + "step": 1175 + }, + { + "epoch": 0.31146891909726804, + "grad_norm": 0.0165408868342638, + "learning_rate": 0.2993573739985648, + "loss": 0.1661, + "num_input_tokens_seen": 1066848, + "step": 1180 + }, + { + "epoch": 0.3127887026527649, + "grad_norm": 0.012539774179458618, + "learning_rate": 0.2993519157753216, + "loss": 0.1338, + "num_input_tokens_seen": 1071136, + "step": 1185 + }, + { + "epoch": 0.3141084862082619, + "grad_norm": 0.026923632249236107, + "learning_rate": 0.2993464345201361, + "loss": 0.1466, + "num_input_tokens_seen": 1075552, + "step": 1190 + }, + { + "epoch": 0.31542826976375876, + "grad_norm": 0.027983980253338814, + "learning_rate": 0.2993409302338536, + "loss": 0.1492, + "num_input_tokens_seen": 1080032, + "step": 1195 + }, + { + "epoch": 0.31674805331925565, + "grad_norm": 0.01721656695008278, + "learning_rate": 0.2993354029173229, + "loss": 0.1626, + "num_input_tokens_seen": 1084928, + "step": 1200 + }, + { + "epoch": 0.31674805331925565, + "eval_loss": 0.17083507776260376, + "eval_runtime": 75.9237, + "eval_samples_per_second": 88.707, + "eval_steps_per_second": 22.18, + "num_input_tokens_seen": 1084928, + "step": 1200 + }, + { + "epoch": 0.31806783687475254, + "grad_norm": 0.0188509002327919, + "learning_rate": 0.2993298525713965, + "loss": 0.1993, + "num_input_tokens_seen": 1089632, + "step": 1205 + }, + { + "epoch": 0.31938762043024943, + "grad_norm": 0.009793519042432308, + "learning_rate": 0.29932427919693017, + "loss": 0.1938, + "num_input_tokens_seen": 1093920, + "step": 1210 + }, + { + "epoch": 0.3207074039857463, + "grad_norm": 0.03133498504757881, + "learning_rate": 0.2993186827947834, + "loss": 0.2177, + "num_input_tokens_seen": 1098368, + "step": 1215 + }, + { + "epoch": 0.3220271875412432, + "grad_norm": 0.011872789822518826, + "learning_rate": 0.2993130633658194, + "loss": 0.1842, + "num_input_tokens_seen": 1102912, + "step": 1220 + }, + { + "epoch": 0.32334697109674015, + "grad_norm": 0.01801072061061859, + "learning_rate": 0.29930742091090456, + "loss": 0.1931, + "num_input_tokens_seen": 1107456, + "step": 1225 + }, + { + "epoch": 0.32466675465223704, + "grad_norm": 0.027939675375819206, + "learning_rate": 0.29930175543090914, + "loss": 0.1742, + "num_input_tokens_seen": 1111840, + "step": 1230 + }, + { + "epoch": 0.32598653820773393, + "grad_norm": 0.019021207466721535, + "learning_rate": 0.2992960669267068, + "loss": 0.177, + "num_input_tokens_seen": 1116224, + "step": 1235 + }, + { + "epoch": 0.3273063217632308, + "grad_norm": 0.008341898210346699, + "learning_rate": 0.29929035539917476, + "loss": 0.1478, + "num_input_tokens_seen": 1120800, + "step": 1240 + }, + { + "epoch": 0.3286261053187277, + "grad_norm": 0.018850918859243393, + "learning_rate": 0.2992846208491938, + "loss": 0.1729, + "num_input_tokens_seen": 1125280, + "step": 1245 + }, + { + "epoch": 0.3299458888742246, + "grad_norm": 0.009253122843801975, + "learning_rate": 0.2992788632776483, + "loss": 0.1188, + "num_input_tokens_seen": 1129600, + "step": 1250 + }, + { + "epoch": 0.33126567242972155, + "grad_norm": 0.023635873571038246, + "learning_rate": 0.29927308268542613, + "loss": 0.1954, + "num_input_tokens_seen": 1134112, + "step": 1255 + }, + { + "epoch": 0.33258545598521844, + "grad_norm": 0.0102853924036026, + "learning_rate": 0.2992672790734187, + "loss": 0.1582, + "num_input_tokens_seen": 1138592, + "step": 1260 + }, + { + "epoch": 0.3339052395407153, + "grad_norm": 0.01048312522470951, + "learning_rate": 0.299261452442521, + "loss": 0.1416, + "num_input_tokens_seen": 1143200, + "step": 1265 + }, + { + "epoch": 0.3352250230962122, + "grad_norm": 0.01267638523131609, + "learning_rate": 0.29925560279363167, + "loss": 0.126, + "num_input_tokens_seen": 1147840, + "step": 1270 + }, + { + "epoch": 0.3365448066517091, + "grad_norm": 0.00973203033208847, + "learning_rate": 0.29924973012765266, + "loss": 0.1458, + "num_input_tokens_seen": 1152128, + "step": 1275 + }, + { + "epoch": 0.337864590207206, + "grad_norm": 0.019228804856538773, + "learning_rate": 0.29924383444548974, + "loss": 0.1591, + "num_input_tokens_seen": 1156416, + "step": 1280 + }, + { + "epoch": 0.33918437376270294, + "grad_norm": 0.022717628628015518, + "learning_rate": 0.299237915748052, + "loss": 0.1372, + "num_input_tokens_seen": 1160800, + "step": 1285 + }, + { + "epoch": 0.34050415731819983, + "grad_norm": 0.027487007901072502, + "learning_rate": 0.2992319740362522, + "loss": 0.1951, + "num_input_tokens_seen": 1164960, + "step": 1290 + }, + { + "epoch": 0.3418239408736967, + "grad_norm": 0.021288776770234108, + "learning_rate": 0.2992260093110066, + "loss": 0.1982, + "num_input_tokens_seen": 1169600, + "step": 1295 + }, + { + "epoch": 0.3431437244291936, + "grad_norm": 0.01042261440306902, + "learning_rate": 0.2992200215732352, + "loss": 0.1494, + "num_input_tokens_seen": 1174144, + "step": 1300 + }, + { + "epoch": 0.3444635079846905, + "grad_norm": 0.01725694164633751, + "learning_rate": 0.2992140108238611, + "loss": 0.1749, + "num_input_tokens_seen": 1178720, + "step": 1305 + }, + { + "epoch": 0.3457832915401874, + "grad_norm": 0.015691304579377174, + "learning_rate": 0.2992079770638115, + "loss": 0.1682, + "num_input_tokens_seen": 1183008, + "step": 1310 + }, + { + "epoch": 0.34710307509568433, + "grad_norm": 0.01028724480420351, + "learning_rate": 0.29920192029401677, + "loss": 0.1383, + "num_input_tokens_seen": 1187392, + "step": 1315 + }, + { + "epoch": 0.3484228586511812, + "grad_norm": 0.019815417006611824, + "learning_rate": 0.2991958405154109, + "loss": 0.1293, + "num_input_tokens_seen": 1192032, + "step": 1320 + }, + { + "epoch": 0.3497426422066781, + "grad_norm": 0.020265651866793633, + "learning_rate": 0.29918973772893154, + "loss": 0.173, + "num_input_tokens_seen": 1196640, + "step": 1325 + }, + { + "epoch": 0.351062425762175, + "grad_norm": 0.01801508292555809, + "learning_rate": 0.29918361193551973, + "loss": 0.1814, + "num_input_tokens_seen": 1201248, + "step": 1330 + }, + { + "epoch": 0.3523822093176719, + "grad_norm": 0.03358190879225731, + "learning_rate": 0.29917746313612026, + "loss": 0.1985, + "num_input_tokens_seen": 1205856, + "step": 1335 + }, + { + "epoch": 0.3537019928731688, + "grad_norm": 0.01084743533283472, + "learning_rate": 0.29917129133168124, + "loss": 0.171, + "num_input_tokens_seen": 1210432, + "step": 1340 + }, + { + "epoch": 0.3550217764286657, + "grad_norm": 0.02676808089017868, + "learning_rate": 0.2991650965231546, + "loss": 0.1725, + "num_input_tokens_seen": 1215168, + "step": 1345 + }, + { + "epoch": 0.3563415599841626, + "grad_norm": 0.013283584266901016, + "learning_rate": 0.29915887871149544, + "loss": 0.1848, + "num_input_tokens_seen": 1220064, + "step": 1350 + }, + { + "epoch": 0.3576613435396595, + "grad_norm": 0.011968174949288368, + "learning_rate": 0.2991526378976628, + "loss": 0.1858, + "num_input_tokens_seen": 1224512, + "step": 1355 + }, + { + "epoch": 0.3589811270951564, + "grad_norm": 0.0152819212526083, + "learning_rate": 0.29914637408261896, + "loss": 0.1634, + "num_input_tokens_seen": 1229024, + "step": 1360 + }, + { + "epoch": 0.3603009106506533, + "grad_norm": 0.012122084386646748, + "learning_rate": 0.29914008726733, + "loss": 0.1511, + "num_input_tokens_seen": 1233472, + "step": 1365 + }, + { + "epoch": 0.3616206942061502, + "grad_norm": 0.014549699611961842, + "learning_rate": 0.2991337774527653, + "loss": 0.1797, + "num_input_tokens_seen": 1238304, + "step": 1370 + }, + { + "epoch": 0.36294047776164706, + "grad_norm": 0.011119179427623749, + "learning_rate": 0.2991274446398981, + "loss": 0.1444, + "num_input_tokens_seen": 1242784, + "step": 1375 + }, + { + "epoch": 0.364260261317144, + "grad_norm": 0.013916585594415665, + "learning_rate": 0.29912108882970484, + "loss": 0.137, + "num_input_tokens_seen": 1247264, + "step": 1380 + }, + { + "epoch": 0.3655800448726409, + "grad_norm": 0.0171501524746418, + "learning_rate": 0.2991147100231657, + "loss": 0.1762, + "num_input_tokens_seen": 1251712, + "step": 1385 + }, + { + "epoch": 0.3668998284281378, + "grad_norm": 0.020163768902420998, + "learning_rate": 0.2991083082212644, + "loss": 0.1714, + "num_input_tokens_seen": 1256416, + "step": 1390 + }, + { + "epoch": 0.3682196119836347, + "grad_norm": 0.011016067117452621, + "learning_rate": 0.2991018834249881, + "loss": 0.1504, + "num_input_tokens_seen": 1260640, + "step": 1395 + }, + { + "epoch": 0.36953939553913157, + "grad_norm": 0.011099683120846748, + "learning_rate": 0.29909543563532764, + "loss": 0.1431, + "num_input_tokens_seen": 1265312, + "step": 1400 + }, + { + "epoch": 0.36953939553913157, + "eval_loss": 0.15640906989574432, + "eval_runtime": 75.8347, + "eval_samples_per_second": 88.812, + "eval_steps_per_second": 22.206, + "num_input_tokens_seen": 1265312, + "step": 1400 + }, + { + "epoch": 0.37085917909462845, + "grad_norm": 0.014368798583745956, + "learning_rate": 0.29908896485327746, + "loss": 0.1618, + "num_input_tokens_seen": 1269824, + "step": 1405 + }, + { + "epoch": 0.3721789626501254, + "grad_norm": 0.011822053231298923, + "learning_rate": 0.29908247107983527, + "loss": 0.1638, + "num_input_tokens_seen": 1274592, + "step": 1410 + }, + { + "epoch": 0.3734987462056223, + "grad_norm": 0.012189829722046852, + "learning_rate": 0.29907595431600253, + "loss": 0.1731, + "num_input_tokens_seen": 1279072, + "step": 1415 + }, + { + "epoch": 0.3748185297611192, + "grad_norm": 0.016046881675720215, + "learning_rate": 0.29906941456278424, + "loss": 0.1679, + "num_input_tokens_seen": 1283488, + "step": 1420 + }, + { + "epoch": 0.37613831331661607, + "grad_norm": 0.020486503839492798, + "learning_rate": 0.2990628518211889, + "loss": 0.2028, + "num_input_tokens_seen": 1288160, + "step": 1425 + }, + { + "epoch": 0.37745809687211296, + "grad_norm": 0.015101040713489056, + "learning_rate": 0.2990562660922286, + "loss": 0.184, + "num_input_tokens_seen": 1292512, + "step": 1430 + }, + { + "epoch": 0.37877788042760985, + "grad_norm": 0.012599697336554527, + "learning_rate": 0.2990496573769189, + "loss": 0.182, + "num_input_tokens_seen": 1296736, + "step": 1435 + }, + { + "epoch": 0.3800976639831068, + "grad_norm": 0.011938369832932949, + "learning_rate": 0.29904302567627894, + "loss": 0.1458, + "num_input_tokens_seen": 1301056, + "step": 1440 + }, + { + "epoch": 0.3814174475386037, + "grad_norm": 0.00920266006141901, + "learning_rate": 0.2990363709913314, + "loss": 0.143, + "num_input_tokens_seen": 1305216, + "step": 1445 + }, + { + "epoch": 0.38273723109410057, + "grad_norm": 0.013366157189011574, + "learning_rate": 0.29902969332310264, + "loss": 0.1436, + "num_input_tokens_seen": 1309664, + "step": 1450 + }, + { + "epoch": 0.38405701464959746, + "grad_norm": 0.024419404566287994, + "learning_rate": 0.2990229926726223, + "loss": 0.1407, + "num_input_tokens_seen": 1314464, + "step": 1455 + }, + { + "epoch": 0.38537679820509435, + "grad_norm": 0.01959303766489029, + "learning_rate": 0.29901626904092365, + "loss": 0.1592, + "num_input_tokens_seen": 1319168, + "step": 1460 + }, + { + "epoch": 0.38669658176059124, + "grad_norm": 0.017410948872566223, + "learning_rate": 0.2990095224290438, + "loss": 0.1808, + "num_input_tokens_seen": 1323616, + "step": 1465 + }, + { + "epoch": 0.3880163653160882, + "grad_norm": 0.015613419935107231, + "learning_rate": 0.29900275283802297, + "loss": 0.1784, + "num_input_tokens_seen": 1328192, + "step": 1470 + }, + { + "epoch": 0.3893361488715851, + "grad_norm": 0.008762115612626076, + "learning_rate": 0.2989959602689051, + "loss": 0.171, + "num_input_tokens_seen": 1332800, + "step": 1475 + }, + { + "epoch": 0.39065593242708196, + "grad_norm": 0.011385131627321243, + "learning_rate": 0.2989891447227379, + "loss": 0.179, + "num_input_tokens_seen": 1337376, + "step": 1480 + }, + { + "epoch": 0.39197571598257885, + "grad_norm": 0.01010975707322359, + "learning_rate": 0.29898230620057215, + "loss": 0.1714, + "num_input_tokens_seen": 1341888, + "step": 1485 + }, + { + "epoch": 0.39329549953807574, + "grad_norm": 0.0080032330006361, + "learning_rate": 0.2989754447034626, + "loss": 0.14, + "num_input_tokens_seen": 1346592, + "step": 1490 + }, + { + "epoch": 0.39461528309357263, + "grad_norm": 0.012568931095302105, + "learning_rate": 0.2989685602324673, + "loss": 0.1563, + "num_input_tokens_seen": 1351232, + "step": 1495 + }, + { + "epoch": 0.3959350666490696, + "grad_norm": 0.014087880961596966, + "learning_rate": 0.298961652788648, + "loss": 0.1525, + "num_input_tokens_seen": 1355648, + "step": 1500 + }, + { + "epoch": 0.39725485020456647, + "grad_norm": 0.0135092344135046, + "learning_rate": 0.29895472237306986, + "loss": 0.1528, + "num_input_tokens_seen": 1360096, + "step": 1505 + }, + { + "epoch": 0.39857463376006336, + "grad_norm": 0.02037392184138298, + "learning_rate": 0.29894776898680164, + "loss": 0.1768, + "num_input_tokens_seen": 1364640, + "step": 1510 + }, + { + "epoch": 0.39989441731556025, + "grad_norm": 0.010121186263859272, + "learning_rate": 0.29894079263091566, + "loss": 0.1734, + "num_input_tokens_seen": 1369344, + "step": 1515 + }, + { + "epoch": 0.40121420087105714, + "grad_norm": 0.013222168199717999, + "learning_rate": 0.2989337933064877, + "loss": 0.1652, + "num_input_tokens_seen": 1373824, + "step": 1520 + }, + { + "epoch": 0.402533984426554, + "grad_norm": 0.020236413925886154, + "learning_rate": 0.29892677101459725, + "loss": 0.1772, + "num_input_tokens_seen": 1378304, + "step": 1525 + }, + { + "epoch": 0.40385376798205097, + "grad_norm": 0.015420015901327133, + "learning_rate": 0.2989197257563272, + "loss": 0.1389, + "num_input_tokens_seen": 1382784, + "step": 1530 + }, + { + "epoch": 0.40517355153754786, + "grad_norm": 0.016173260286450386, + "learning_rate": 0.2989126575327639, + "loss": 0.172, + "num_input_tokens_seen": 1387424, + "step": 1535 + }, + { + "epoch": 0.40649333509304475, + "grad_norm": 0.012048077769577503, + "learning_rate": 0.29890556634499754, + "loss": 0.1266, + "num_input_tokens_seen": 1392000, + "step": 1540 + }, + { + "epoch": 0.40781311864854164, + "grad_norm": 0.020538749173283577, + "learning_rate": 0.2988984521941216, + "loss": 0.2, + "num_input_tokens_seen": 1396672, + "step": 1545 + }, + { + "epoch": 0.40913290220403853, + "grad_norm": 0.01347770169377327, + "learning_rate": 0.29889131508123307, + "loss": 0.192, + "num_input_tokens_seen": 1401344, + "step": 1550 + }, + { + "epoch": 0.4104526857595354, + "grad_norm": 0.013745669275522232, + "learning_rate": 0.2988841550074327, + "loss": 0.1419, + "num_input_tokens_seen": 1405824, + "step": 1555 + }, + { + "epoch": 0.4117724693150323, + "grad_norm": 0.011017225682735443, + "learning_rate": 0.2988769719738246, + "loss": 0.1729, + "num_input_tokens_seen": 1410304, + "step": 1560 + }, + { + "epoch": 0.41309225287052925, + "grad_norm": 0.009693301282823086, + "learning_rate": 0.29886976598151666, + "loss": 0.1377, + "num_input_tokens_seen": 1415008, + "step": 1565 + }, + { + "epoch": 0.41441203642602614, + "grad_norm": 0.013187972828745842, + "learning_rate": 0.29886253703161986, + "loss": 0.1172, + "num_input_tokens_seen": 1420032, + "step": 1570 + }, + { + "epoch": 0.41573181998152303, + "grad_norm": 0.016357528045773506, + "learning_rate": 0.29885528512524917, + "loss": 0.136, + "num_input_tokens_seen": 1424448, + "step": 1575 + }, + { + "epoch": 0.4170516035370199, + "grad_norm": 0.013520017266273499, + "learning_rate": 0.29884801026352287, + "loss": 0.1585, + "num_input_tokens_seen": 1428992, + "step": 1580 + }, + { + "epoch": 0.4183713870925168, + "grad_norm": 0.021026870235800743, + "learning_rate": 0.2988407124475629, + "loss": 0.1759, + "num_input_tokens_seen": 1433280, + "step": 1585 + }, + { + "epoch": 0.4196911706480137, + "grad_norm": 0.020026179030537605, + "learning_rate": 0.2988333916784945, + "loss": 0.1684, + "num_input_tokens_seen": 1437664, + "step": 1590 + }, + { + "epoch": 0.42101095420351065, + "grad_norm": 0.013080363161861897, + "learning_rate": 0.2988260479574468, + "loss": 0.1664, + "num_input_tokens_seen": 1442560, + "step": 1595 + }, + { + "epoch": 0.42233073775900754, + "grad_norm": 0.011754047125577927, + "learning_rate": 0.2988186812855523, + "loss": 0.1372, + "num_input_tokens_seen": 1447200, + "step": 1600 + }, + { + "epoch": 0.42233073775900754, + "eval_loss": 0.20221038162708282, + "eval_runtime": 75.88, + "eval_samples_per_second": 88.759, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 1447200, + "step": 1600 + }, + { + "epoch": 0.4236505213145044, + "grad_norm": 0.010455409064888954, + "learning_rate": 0.29881129166394693, + "loss": 0.1699, + "num_input_tokens_seen": 1451520, + "step": 1605 + }, + { + "epoch": 0.4249703048700013, + "grad_norm": 0.01423876266926527, + "learning_rate": 0.29880387909377026, + "loss": 0.128, + "num_input_tokens_seen": 1455776, + "step": 1610 + }, + { + "epoch": 0.4262900884254982, + "grad_norm": 0.012579250149428844, + "learning_rate": 0.2987964435761655, + "loss": 0.1697, + "num_input_tokens_seen": 1460448, + "step": 1615 + }, + { + "epoch": 0.4276098719809951, + "grad_norm": 0.012770898640155792, + "learning_rate": 0.29878898511227925, + "loss": 0.15, + "num_input_tokens_seen": 1464928, + "step": 1620 + }, + { + "epoch": 0.42892965553649204, + "grad_norm": 0.018137408420443535, + "learning_rate": 0.2987815037032617, + "loss": 0.1425, + "num_input_tokens_seen": 1469344, + "step": 1625 + }, + { + "epoch": 0.43024943909198893, + "grad_norm": 0.01738237775862217, + "learning_rate": 0.29877399935026655, + "loss": 0.1499, + "num_input_tokens_seen": 1473760, + "step": 1630 + }, + { + "epoch": 0.4315692226474858, + "grad_norm": 0.019674118608236313, + "learning_rate": 0.2987664720544511, + "loss": 0.1304, + "num_input_tokens_seen": 1478208, + "step": 1635 + }, + { + "epoch": 0.4328890062029827, + "grad_norm": 0.025561867281794548, + "learning_rate": 0.2987589218169761, + "loss": 0.2095, + "num_input_tokens_seen": 1482464, + "step": 1640 + }, + { + "epoch": 0.4342087897584796, + "grad_norm": 0.016805540770292282, + "learning_rate": 0.29875134863900604, + "loss": 0.1367, + "num_input_tokens_seen": 1486880, + "step": 1645 + }, + { + "epoch": 0.4355285733139765, + "grad_norm": 0.010505473241209984, + "learning_rate": 0.29874375252170865, + "loss": 0.1618, + "num_input_tokens_seen": 1491552, + "step": 1650 + }, + { + "epoch": 0.43684835686947343, + "grad_norm": 0.014019533060491085, + "learning_rate": 0.2987361334662553, + "loss": 0.1472, + "num_input_tokens_seen": 1496064, + "step": 1655 + }, + { + "epoch": 0.4381681404249703, + "grad_norm": 0.009831941686570644, + "learning_rate": 0.29872849147382113, + "loss": 0.1622, + "num_input_tokens_seen": 1500768, + "step": 1660 + }, + { + "epoch": 0.4394879239804672, + "grad_norm": 0.02941649965941906, + "learning_rate": 0.2987208265455845, + "loss": 0.1877, + "num_input_tokens_seen": 1505216, + "step": 1665 + }, + { + "epoch": 0.4408077075359641, + "grad_norm": 0.007946347817778587, + "learning_rate": 0.29871313868272753, + "loss": 0.1505, + "num_input_tokens_seen": 1509984, + "step": 1670 + }, + { + "epoch": 0.442127491091461, + "grad_norm": 0.009349900297820568, + "learning_rate": 0.29870542788643567, + "loss": 0.1324, + "num_input_tokens_seen": 1514240, + "step": 1675 + }, + { + "epoch": 0.4434472746469579, + "grad_norm": 0.010799652896821499, + "learning_rate": 0.2986976941578981, + "loss": 0.1693, + "num_input_tokens_seen": 1518880, + "step": 1680 + }, + { + "epoch": 0.4447670582024548, + "grad_norm": 0.01512147020548582, + "learning_rate": 0.29868993749830747, + "loss": 0.1622, + "num_input_tokens_seen": 1523264, + "step": 1685 + }, + { + "epoch": 0.4460868417579517, + "grad_norm": 0.008160443045198917, + "learning_rate": 0.2986821579088598, + "loss": 0.1595, + "num_input_tokens_seen": 1527648, + "step": 1690 + }, + { + "epoch": 0.4474066253134486, + "grad_norm": 0.014599068090319633, + "learning_rate": 0.29867435539075504, + "loss": 0.1689, + "num_input_tokens_seen": 1531968, + "step": 1695 + }, + { + "epoch": 0.4487264088689455, + "grad_norm": 0.01145162619650364, + "learning_rate": 0.2986665299451963, + "loss": 0.1442, + "num_input_tokens_seen": 1536416, + "step": 1700 + }, + { + "epoch": 0.4500461924244424, + "grad_norm": 0.027423417195677757, + "learning_rate": 0.29865868157339037, + "loss": 0.2138, + "num_input_tokens_seen": 1541184, + "step": 1705 + }, + { + "epoch": 0.45136597597993927, + "grad_norm": 0.011148041114211082, + "learning_rate": 0.2986508102765476, + "loss": 0.1577, + "num_input_tokens_seen": 1545952, + "step": 1710 + }, + { + "epoch": 0.45268575953543616, + "grad_norm": 0.018826356157660484, + "learning_rate": 0.2986429160558818, + "loss": 0.177, + "num_input_tokens_seen": 1550432, + "step": 1715 + }, + { + "epoch": 0.4540055430909331, + "grad_norm": 0.014223514124751091, + "learning_rate": 0.2986349989126104, + "loss": 0.1357, + "num_input_tokens_seen": 1555040, + "step": 1720 + }, + { + "epoch": 0.45532532664643, + "grad_norm": 0.011184771545231342, + "learning_rate": 0.29862705884795426, + "loss": 0.1617, + "num_input_tokens_seen": 1559936, + "step": 1725 + }, + { + "epoch": 0.4566451102019269, + "grad_norm": 0.01274403277784586, + "learning_rate": 0.2986190958631379, + "loss": 0.1637, + "num_input_tokens_seen": 1564352, + "step": 1730 + }, + { + "epoch": 0.4579648937574238, + "grad_norm": 0.01388278417289257, + "learning_rate": 0.29861110995938933, + "loss": 0.1267, + "num_input_tokens_seen": 1568928, + "step": 1735 + }, + { + "epoch": 0.45928467731292066, + "grad_norm": 0.011775447987020016, + "learning_rate": 0.29860310113794, + "loss": 0.1425, + "num_input_tokens_seen": 1573632, + "step": 1740 + }, + { + "epoch": 0.46060446086841755, + "grad_norm": 0.016194073483347893, + "learning_rate": 0.29859506940002506, + "loss": 0.1563, + "num_input_tokens_seen": 1578208, + "step": 1745 + }, + { + "epoch": 0.4619242444239145, + "grad_norm": 0.009901455603539944, + "learning_rate": 0.298587014746883, + "loss": 0.1273, + "num_input_tokens_seen": 1583296, + "step": 1750 + }, + { + "epoch": 0.4632440279794114, + "grad_norm": 0.014732127077877522, + "learning_rate": 0.298578937179756, + "loss": 0.1429, + "num_input_tokens_seen": 1587904, + "step": 1755 + }, + { + "epoch": 0.4645638115349083, + "grad_norm": 0.015618130564689636, + "learning_rate": 0.29857083669988976, + "loss": 0.1789, + "num_input_tokens_seen": 1592416, + "step": 1760 + }, + { + "epoch": 0.46588359509040517, + "grad_norm": 0.016639621928334236, + "learning_rate": 0.29856271330853346, + "loss": 0.1839, + "num_input_tokens_seen": 1596864, + "step": 1765 + }, + { + "epoch": 0.46720337864590206, + "grad_norm": 0.012982130981981754, + "learning_rate": 0.2985545670069398, + "loss": 0.1858, + "num_input_tokens_seen": 1601504, + "step": 1770 + }, + { + "epoch": 0.46852316220139895, + "grad_norm": 0.006951024755835533, + "learning_rate": 0.29854639779636505, + "loss": 0.1542, + "num_input_tokens_seen": 1606080, + "step": 1775 + }, + { + "epoch": 0.4698429457568959, + "grad_norm": 0.015585050918161869, + "learning_rate": 0.298538205678069, + "loss": 0.1823, + "num_input_tokens_seen": 1610432, + "step": 1780 + }, + { + "epoch": 0.4711627293123928, + "grad_norm": 0.008662565611302853, + "learning_rate": 0.298529990653315, + "loss": 0.146, + "num_input_tokens_seen": 1614848, + "step": 1785 + }, + { + "epoch": 0.47248251286788967, + "grad_norm": 0.005524289328604937, + "learning_rate": 0.29852175272336984, + "loss": 0.1075, + "num_input_tokens_seen": 1619264, + "step": 1790 + }, + { + "epoch": 0.47380229642338656, + "grad_norm": 0.012119464576244354, + "learning_rate": 0.29851349188950405, + "loss": 0.1571, + "num_input_tokens_seen": 1623648, + "step": 1795 + }, + { + "epoch": 0.47512207997888345, + "grad_norm": 0.012851925566792488, + "learning_rate": 0.2985052081529914, + "loss": 0.1373, + "num_input_tokens_seen": 1628352, + "step": 1800 + }, + { + "epoch": 0.47512207997888345, + "eval_loss": 0.14386513829231262, + "eval_runtime": 75.844, + "eval_samples_per_second": 88.801, + "eval_steps_per_second": 22.203, + "num_input_tokens_seen": 1628352, + "step": 1800 + }, + { + "epoch": 0.47644186353438034, + "grad_norm": 0.016318252310156822, + "learning_rate": 0.29849690151510944, + "loss": 0.1358, + "num_input_tokens_seen": 1633088, + "step": 1805 + }, + { + "epoch": 0.4777616470898773, + "grad_norm": 0.014173184521496296, + "learning_rate": 0.2984885719771392, + "loss": 0.174, + "num_input_tokens_seen": 1637824, + "step": 1810 + }, + { + "epoch": 0.4790814306453742, + "grad_norm": 0.007857879623770714, + "learning_rate": 0.2984802195403651, + "loss": 0.122, + "num_input_tokens_seen": 1642240, + "step": 1815 + }, + { + "epoch": 0.48040121420087106, + "grad_norm": 0.005983575247228146, + "learning_rate": 0.2984718442060752, + "loss": 0.1092, + "num_input_tokens_seen": 1646784, + "step": 1820 + }, + { + "epoch": 0.48172099775636795, + "grad_norm": 0.014810864813625813, + "learning_rate": 0.2984634459755611, + "loss": 0.1716, + "num_input_tokens_seen": 1651488, + "step": 1825 + }, + { + "epoch": 0.48304078131186484, + "grad_norm": 0.015910720452666283, + "learning_rate": 0.29845502485011793, + "loss": 0.1721, + "num_input_tokens_seen": 1656192, + "step": 1830 + }, + { + "epoch": 0.48436056486736173, + "grad_norm": 0.012184597551822662, + "learning_rate": 0.2984465808310444, + "loss": 0.161, + "num_input_tokens_seen": 1660864, + "step": 1835 + }, + { + "epoch": 0.4856803484228587, + "grad_norm": 0.01050629187375307, + "learning_rate": 0.29843811391964253, + "loss": 0.1389, + "num_input_tokens_seen": 1665472, + "step": 1840 + }, + { + "epoch": 0.48700013197835557, + "grad_norm": 0.009105296805500984, + "learning_rate": 0.2984296241172182, + "loss": 0.1616, + "num_input_tokens_seen": 1670016, + "step": 1845 + }, + { + "epoch": 0.48831991553385246, + "grad_norm": 0.010969123803079128, + "learning_rate": 0.29842111142508043, + "loss": 0.1238, + "num_input_tokens_seen": 1674496, + "step": 1850 + }, + { + "epoch": 0.48963969908934935, + "grad_norm": 0.011059815064072609, + "learning_rate": 0.29841257584454217, + "loss": 0.1329, + "num_input_tokens_seen": 1678880, + "step": 1855 + }, + { + "epoch": 0.49095948264484623, + "grad_norm": 0.011011156253516674, + "learning_rate": 0.29840401737691963, + "loss": 0.1134, + "num_input_tokens_seen": 1683264, + "step": 1860 + }, + { + "epoch": 0.4922792662003431, + "grad_norm": 0.01630229689180851, + "learning_rate": 0.29839543602353263, + "loss": 0.1577, + "num_input_tokens_seen": 1688000, + "step": 1865 + }, + { + "epoch": 0.49359904975584007, + "grad_norm": 0.009742028079926968, + "learning_rate": 0.2983868317857046, + "loss": 0.1222, + "num_input_tokens_seen": 1692640, + "step": 1870 + }, + { + "epoch": 0.49491883331133696, + "grad_norm": 0.011534149758517742, + "learning_rate": 0.2983782046647623, + "loss": 0.1256, + "num_input_tokens_seen": 1697120, + "step": 1875 + }, + { + "epoch": 0.49623861686683385, + "grad_norm": 0.008358186110854149, + "learning_rate": 0.2983695546620362, + "loss": 0.171, + "num_input_tokens_seen": 1701728, + "step": 1880 + }, + { + "epoch": 0.49755840042233074, + "grad_norm": 0.014756348915398121, + "learning_rate": 0.2983608817788603, + "loss": 0.2091, + "num_input_tokens_seen": 1706208, + "step": 1885 + }, + { + "epoch": 0.4988781839778276, + "grad_norm": 0.018950676545500755, + "learning_rate": 0.29835218601657193, + "loss": 0.1329, + "num_input_tokens_seen": 1710688, + "step": 1890 + }, + { + "epoch": 0.5001979675333246, + "grad_norm": 0.017507467418909073, + "learning_rate": 0.29834346737651224, + "loss": 0.1702, + "num_input_tokens_seen": 1715296, + "step": 1895 + }, + { + "epoch": 0.5015177510888215, + "grad_norm": 0.010053876787424088, + "learning_rate": 0.29833472586002563, + "loss": 0.1528, + "num_input_tokens_seen": 1719776, + "step": 1900 + }, + { + "epoch": 0.5028375346443184, + "grad_norm": 0.008023696020245552, + "learning_rate": 0.29832596146846024, + "loss": 0.176, + "num_input_tokens_seen": 1724320, + "step": 1905 + }, + { + "epoch": 0.5041573181998152, + "grad_norm": 0.009918652474880219, + "learning_rate": 0.2983171742031676, + "loss": 0.1616, + "num_input_tokens_seen": 1728640, + "step": 1910 + }, + { + "epoch": 0.5054771017553121, + "grad_norm": 0.010670655407011509, + "learning_rate": 0.2983083640655028, + "loss": 0.1472, + "num_input_tokens_seen": 1732960, + "step": 1915 + }, + { + "epoch": 0.506796885310809, + "grad_norm": 0.018419809639453888, + "learning_rate": 0.29829953105682455, + "loss": 0.18, + "num_input_tokens_seen": 1737216, + "step": 1920 + }, + { + "epoch": 0.5081166688663059, + "grad_norm": 0.015478893183171749, + "learning_rate": 0.29829067517849495, + "loss": 0.1432, + "num_input_tokens_seen": 1741824, + "step": 1925 + }, + { + "epoch": 0.5094364524218028, + "grad_norm": 0.0076490589417517185, + "learning_rate": 0.2982817964318797, + "loss": 0.1853, + "num_input_tokens_seen": 1746048, + "step": 1930 + }, + { + "epoch": 0.5107562359772997, + "grad_norm": 0.008944222703576088, + "learning_rate": 0.298272894818348, + "loss": 0.1511, + "num_input_tokens_seen": 1750624, + "step": 1935 + }, + { + "epoch": 0.5120760195327966, + "grad_norm": 0.011407849378883839, + "learning_rate": 0.2982639703392726, + "loss": 0.1384, + "num_input_tokens_seen": 1755168, + "step": 1940 + }, + { + "epoch": 0.5133958030882935, + "grad_norm": 0.01807153970003128, + "learning_rate": 0.29825502299602974, + "loss": 0.147, + "num_input_tokens_seen": 1759712, + "step": 1945 + }, + { + "epoch": 0.5147155866437905, + "grad_norm": 0.015131832100450993, + "learning_rate": 0.2982460527899993, + "loss": 0.1583, + "num_input_tokens_seen": 1764352, + "step": 1950 + }, + { + "epoch": 0.5160353701992874, + "grad_norm": 0.009430116042494774, + "learning_rate": 0.29823705972256453, + "loss": 0.1782, + "num_input_tokens_seen": 1768896, + "step": 1955 + }, + { + "epoch": 0.5173551537547842, + "grad_norm": 0.008882543072104454, + "learning_rate": 0.2982280437951123, + "loss": 0.1441, + "num_input_tokens_seen": 1773344, + "step": 1960 + }, + { + "epoch": 0.5186749373102811, + "grad_norm": 0.010477996431291103, + "learning_rate": 0.298219005009033, + "loss": 0.1364, + "num_input_tokens_seen": 1777664, + "step": 1965 + }, + { + "epoch": 0.519994720865778, + "grad_norm": 0.009420307353138924, + "learning_rate": 0.29820994336572043, + "loss": 0.1289, + "num_input_tokens_seen": 1782144, + "step": 1970 + }, + { + "epoch": 0.5213145044212749, + "grad_norm": 0.010934803634881973, + "learning_rate": 0.2982008588665721, + "loss": 0.1266, + "num_input_tokens_seen": 1786624, + "step": 1975 + }, + { + "epoch": 0.5226342879767718, + "grad_norm": 0.016389261931180954, + "learning_rate": 0.2981917515129889, + "loss": 0.1577, + "num_input_tokens_seen": 1791296, + "step": 1980 + }, + { + "epoch": 0.5239540715322687, + "grad_norm": 0.012658503837883472, + "learning_rate": 0.2981826213063753, + "loss": 0.1307, + "num_input_tokens_seen": 1795712, + "step": 1985 + }, + { + "epoch": 0.5252738550877656, + "grad_norm": 0.021145110949873924, + "learning_rate": 0.2981734682481394, + "loss": 0.1889, + "num_input_tokens_seen": 1800224, + "step": 1990 + }, + { + "epoch": 0.5265936386432625, + "grad_norm": 0.026462016627192497, + "learning_rate": 0.29816429233969255, + "loss": 0.2215, + "num_input_tokens_seen": 1804704, + "step": 1995 + }, + { + "epoch": 0.5279134221987594, + "grad_norm": 0.01178019493818283, + "learning_rate": 0.2981550935824499, + "loss": 0.1464, + "num_input_tokens_seen": 1809312, + "step": 2000 + }, + { + "epoch": 0.5279134221987594, + "eval_loss": 0.21287807822227478, + "eval_runtime": 75.8006, + "eval_samples_per_second": 88.852, + "eval_steps_per_second": 22.216, + "num_input_tokens_seen": 1809312, + "step": 2000 + }, + { + "epoch": 0.5292332057542563, + "grad_norm": 0.014209797605872154, + "learning_rate": 0.29814587197783, + "loss": 0.1889, + "num_input_tokens_seen": 1813792, + "step": 2005 + }, + { + "epoch": 0.5305529893097533, + "grad_norm": 0.007567676715552807, + "learning_rate": 0.29813662752725495, + "loss": 0.1267, + "num_input_tokens_seen": 1818464, + "step": 2010 + }, + { + "epoch": 0.5318727728652501, + "grad_norm": 0.01113646849989891, + "learning_rate": 0.29812736023215025, + "loss": 0.1361, + "num_input_tokens_seen": 1822432, + "step": 2015 + }, + { + "epoch": 0.533192556420747, + "grad_norm": 0.010186386294662952, + "learning_rate": 0.29811807009394514, + "loss": 0.1421, + "num_input_tokens_seen": 1827008, + "step": 2020 + }, + { + "epoch": 0.5345123399762439, + "grad_norm": 0.014248755760490894, + "learning_rate": 0.2981087571140723, + "loss": 0.1314, + "num_input_tokens_seen": 1831744, + "step": 2025 + }, + { + "epoch": 0.5358321235317408, + "grad_norm": 0.009855576790869236, + "learning_rate": 0.2980994212939678, + "loss": 0.1343, + "num_input_tokens_seen": 1836832, + "step": 2030 + }, + { + "epoch": 0.5371519070872377, + "grad_norm": 0.00959087535738945, + "learning_rate": 0.2980900626350715, + "loss": 0.1066, + "num_input_tokens_seen": 1841472, + "step": 2035 + }, + { + "epoch": 0.5384716906427346, + "grad_norm": 0.01576186716556549, + "learning_rate": 0.29808068113882646, + "loss": 0.1384, + "num_input_tokens_seen": 1846176, + "step": 2040 + }, + { + "epoch": 0.5397914741982315, + "grad_norm": 0.016577541828155518, + "learning_rate": 0.2980712768066795, + "loss": 0.1444, + "num_input_tokens_seen": 1851072, + "step": 2045 + }, + { + "epoch": 0.5411112577537284, + "grad_norm": 0.018216287717223167, + "learning_rate": 0.2980618496400809, + "loss": 0.1807, + "num_input_tokens_seen": 1855776, + "step": 2050 + }, + { + "epoch": 0.5424310413092253, + "grad_norm": 0.014731143601238728, + "learning_rate": 0.2980523996404844, + "loss": 0.1959, + "num_input_tokens_seen": 1860128, + "step": 2055 + }, + { + "epoch": 0.5437508248647221, + "grad_norm": 0.013703517615795135, + "learning_rate": 0.2980429268093473, + "loss": 0.1762, + "num_input_tokens_seen": 1864608, + "step": 2060 + }, + { + "epoch": 0.545070608420219, + "grad_norm": 0.01233205758035183, + "learning_rate": 0.29803343114813047, + "loss": 0.1711, + "num_input_tokens_seen": 1869120, + "step": 2065 + }, + { + "epoch": 0.5463903919757159, + "grad_norm": 0.007688133977353573, + "learning_rate": 0.2980239126582983, + "loss": 0.1723, + "num_input_tokens_seen": 1873728, + "step": 2070 + }, + { + "epoch": 0.5477101755312129, + "grad_norm": 0.00969452504068613, + "learning_rate": 0.2980143713413186, + "loss": 0.1356, + "num_input_tokens_seen": 1878272, + "step": 2075 + }, + { + "epoch": 0.5490299590867098, + "grad_norm": 0.020931722596287727, + "learning_rate": 0.29800480719866274, + "loss": 0.1514, + "num_input_tokens_seen": 1882432, + "step": 2080 + }, + { + "epoch": 0.5503497426422067, + "grad_norm": 0.008838964626193047, + "learning_rate": 0.2979952202318057, + "loss": 0.1262, + "num_input_tokens_seen": 1887008, + "step": 2085 + }, + { + "epoch": 0.5516695261977036, + "grad_norm": 0.014631135389208794, + "learning_rate": 0.2979856104422259, + "loss": 0.1933, + "num_input_tokens_seen": 1891552, + "step": 2090 + }, + { + "epoch": 0.5529893097532005, + "grad_norm": 0.014917601831257343, + "learning_rate": 0.2979759778314052, + "loss": 0.1211, + "num_input_tokens_seen": 1895936, + "step": 2095 + }, + { + "epoch": 0.5543090933086974, + "grad_norm": 0.007737384177744389, + "learning_rate": 0.2979663224008292, + "loss": 0.1426, + "num_input_tokens_seen": 1900448, + "step": 2100 + }, + { + "epoch": 0.5556288768641943, + "grad_norm": 0.005941353272646666, + "learning_rate": 0.2979566441519868, + "loss": 0.1416, + "num_input_tokens_seen": 1904992, + "step": 2105 + }, + { + "epoch": 0.5569486604196912, + "grad_norm": 0.013481711968779564, + "learning_rate": 0.29794694308637054, + "loss": 0.1363, + "num_input_tokens_seen": 1909664, + "step": 2110 + }, + { + "epoch": 0.558268443975188, + "grad_norm": 0.010350354947149754, + "learning_rate": 0.2979372192054764, + "loss": 0.1446, + "num_input_tokens_seen": 1914464, + "step": 2115 + }, + { + "epoch": 0.5595882275306849, + "grad_norm": 0.012060332112014294, + "learning_rate": 0.297927472510804, + "loss": 0.1995, + "num_input_tokens_seen": 1918816, + "step": 2120 + }, + { + "epoch": 0.5609080110861818, + "grad_norm": 0.007879454642534256, + "learning_rate": 0.29791770300385634, + "loss": 0.1614, + "num_input_tokens_seen": 1923424, + "step": 2125 + }, + { + "epoch": 0.5622277946416787, + "grad_norm": 0.009927273727953434, + "learning_rate": 0.29790791068614003, + "loss": 0.1906, + "num_input_tokens_seen": 1927776, + "step": 2130 + }, + { + "epoch": 0.5635475781971757, + "grad_norm": 0.008347705006599426, + "learning_rate": 0.2978980955591652, + "loss": 0.2008, + "num_input_tokens_seen": 1932480, + "step": 2135 + }, + { + "epoch": 0.5648673617526726, + "grad_norm": 0.00940942857414484, + "learning_rate": 0.2978882576244454, + "loss": 0.2052, + "num_input_tokens_seen": 1936896, + "step": 2140 + }, + { + "epoch": 0.5661871453081695, + "grad_norm": 0.0080792885273695, + "learning_rate": 0.2978783968834978, + "loss": 0.1439, + "num_input_tokens_seen": 1941472, + "step": 2145 + }, + { + "epoch": 0.5675069288636664, + "grad_norm": 0.013207337819039822, + "learning_rate": 0.29786851333784303, + "loss": 0.1333, + "num_input_tokens_seen": 1946272, + "step": 2150 + }, + { + "epoch": 0.5688267124191633, + "grad_norm": 0.008294310420751572, + "learning_rate": 0.2978586069890053, + "loss": 0.1308, + "num_input_tokens_seen": 1951040, + "step": 2155 + }, + { + "epoch": 0.5701464959746602, + "grad_norm": 0.01478321198374033, + "learning_rate": 0.29784867783851227, + "loss": 0.1645, + "num_input_tokens_seen": 1956096, + "step": 2160 + }, + { + "epoch": 0.571466279530157, + "grad_norm": 0.018088769167661667, + "learning_rate": 0.2978387258878951, + "loss": 0.1591, + "num_input_tokens_seen": 1960480, + "step": 2165 + }, + { + "epoch": 0.5727860630856539, + "grad_norm": 0.015504246577620506, + "learning_rate": 0.29782875113868856, + "loss": 0.1577, + "num_input_tokens_seen": 1965056, + "step": 2170 + }, + { + "epoch": 0.5741058466411508, + "grad_norm": 0.015700412914156914, + "learning_rate": 0.2978187535924309, + "loss": 0.1486, + "num_input_tokens_seen": 1969504, + "step": 2175 + }, + { + "epoch": 0.5754256301966477, + "grad_norm": 0.00555170513689518, + "learning_rate": 0.29780873325066376, + "loss": 0.144, + "num_input_tokens_seen": 1973984, + "step": 2180 + }, + { + "epoch": 0.5767454137521446, + "grad_norm": 0.00753447413444519, + "learning_rate": 0.2977986901149325, + "loss": 0.128, + "num_input_tokens_seen": 1978464, + "step": 2185 + }, + { + "epoch": 0.5780651973076415, + "grad_norm": 0.014069327153265476, + "learning_rate": 0.29778862418678587, + "loss": 0.1421, + "num_input_tokens_seen": 1983104, + "step": 2190 + }, + { + "epoch": 0.5793849808631385, + "grad_norm": 0.01679610274732113, + "learning_rate": 0.29777853546777616, + "loss": 0.171, + "num_input_tokens_seen": 1987744, + "step": 2195 + }, + { + "epoch": 0.5807047644186354, + "grad_norm": 0.006662668660283089, + "learning_rate": 0.2977684239594592, + "loss": 0.1684, + "num_input_tokens_seen": 1992416, + "step": 2200 + }, + { + "epoch": 0.5807047644186354, + "eval_loss": 0.14221368730068207, + "eval_runtime": 75.7921, + "eval_samples_per_second": 88.861, + "eval_steps_per_second": 22.219, + "num_input_tokens_seen": 1992416, + "step": 2200 + }, + { + "epoch": 0.5820245479741323, + "grad_norm": 0.009130790829658508, + "learning_rate": 0.29775828966339424, + "loss": 0.159, + "num_input_tokens_seen": 1996864, + "step": 2205 + }, + { + "epoch": 0.5833443315296292, + "grad_norm": 0.007163458038121462, + "learning_rate": 0.29774813258114424, + "loss": 0.0966, + "num_input_tokens_seen": 2001248, + "step": 2210 + }, + { + "epoch": 0.5846641150851261, + "grad_norm": 0.00339997885748744, + "learning_rate": 0.29773795271427544, + "loss": 0.0954, + "num_input_tokens_seen": 2005792, + "step": 2215 + }, + { + "epoch": 0.585983898640623, + "grad_norm": 0.01678043231368065, + "learning_rate": 0.2977277500643577, + "loss": 0.2198, + "num_input_tokens_seen": 2010592, + "step": 2220 + }, + { + "epoch": 0.5873036821961198, + "grad_norm": 0.010628641583025455, + "learning_rate": 0.29771752463296447, + "loss": 0.1283, + "num_input_tokens_seen": 2015040, + "step": 2225 + }, + { + "epoch": 0.5886234657516167, + "grad_norm": 0.013831277377903461, + "learning_rate": 0.29770727642167266, + "loss": 0.13, + "num_input_tokens_seen": 2019552, + "step": 2230 + }, + { + "epoch": 0.5899432493071136, + "grad_norm": 0.009185788221657276, + "learning_rate": 0.29769700543206257, + "loss": 0.1536, + "num_input_tokens_seen": 2023904, + "step": 2235 + }, + { + "epoch": 0.5912630328626105, + "grad_norm": 0.01269163005053997, + "learning_rate": 0.2976867116657182, + "loss": 0.1438, + "num_input_tokens_seen": 2028416, + "step": 2240 + }, + { + "epoch": 0.5925828164181074, + "grad_norm": 0.009287577122449875, + "learning_rate": 0.2976763951242269, + "loss": 0.1379, + "num_input_tokens_seen": 2032864, + "step": 2245 + }, + { + "epoch": 0.5939025999736043, + "grad_norm": 0.0065643866546452045, + "learning_rate": 0.29766605580917965, + "loss": 0.1273, + "num_input_tokens_seen": 2037664, + "step": 2250 + }, + { + "epoch": 0.5952223835291012, + "grad_norm": 0.013082236982882023, + "learning_rate": 0.29765569372217093, + "loss": 0.1811, + "num_input_tokens_seen": 2042144, + "step": 2255 + }, + { + "epoch": 0.5965421670845982, + "grad_norm": 0.014224155806005001, + "learning_rate": 0.2976453088647987, + "loss": 0.1106, + "num_input_tokens_seen": 2046144, + "step": 2260 + }, + { + "epoch": 0.5978619506400951, + "grad_norm": 0.006146958097815514, + "learning_rate": 0.2976349012386644, + "loss": 0.1518, + "num_input_tokens_seen": 2050464, + "step": 2265 + }, + { + "epoch": 0.599181734195592, + "grad_norm": 0.0072912960313260555, + "learning_rate": 0.29762447084537297, + "loss": 0.1499, + "num_input_tokens_seen": 2054656, + "step": 2270 + }, + { + "epoch": 0.6005015177510888, + "grad_norm": 0.009555457159876823, + "learning_rate": 0.29761401768653306, + "loss": 0.1249, + "num_input_tokens_seen": 2059232, + "step": 2275 + }, + { + "epoch": 0.6018213013065857, + "grad_norm": 0.00849615503102541, + "learning_rate": 0.29760354176375653, + "loss": 0.104, + "num_input_tokens_seen": 2063552, + "step": 2280 + }, + { + "epoch": 0.6031410848620826, + "grad_norm": 0.015305159613490105, + "learning_rate": 0.29759304307865897, + "loss": 0.1437, + "num_input_tokens_seen": 2067744, + "step": 2285 + }, + { + "epoch": 0.6044608684175795, + "grad_norm": 0.012740511447191238, + "learning_rate": 0.2975825216328594, + "loss": 0.1368, + "num_input_tokens_seen": 2072256, + "step": 2290 + }, + { + "epoch": 0.6057806519730764, + "grad_norm": 0.01045696809887886, + "learning_rate": 0.2975719774279804, + "loss": 0.1324, + "num_input_tokens_seen": 2077088, + "step": 2295 + }, + { + "epoch": 0.6071004355285733, + "grad_norm": 0.008735966868698597, + "learning_rate": 0.29756141046564794, + "loss": 0.124, + "num_input_tokens_seen": 2081760, + "step": 2300 + }, + { + "epoch": 0.6084202190840702, + "grad_norm": 0.01414476241916418, + "learning_rate": 0.2975508207474916, + "loss": 0.1713, + "num_input_tokens_seen": 2085888, + "step": 2305 + }, + { + "epoch": 0.6097400026395671, + "grad_norm": 0.007390377577394247, + "learning_rate": 0.2975402082751445, + "loss": 0.1296, + "num_input_tokens_seen": 2090240, + "step": 2310 + }, + { + "epoch": 0.611059786195064, + "grad_norm": 0.005465405061841011, + "learning_rate": 0.29752957305024313, + "loss": 0.1286, + "num_input_tokens_seen": 2094432, + "step": 2315 + }, + { + "epoch": 0.612379569750561, + "grad_norm": 0.015551768243312836, + "learning_rate": 0.2975189150744277, + "loss": 0.1544, + "num_input_tokens_seen": 2098816, + "step": 2320 + }, + { + "epoch": 0.6136993533060578, + "grad_norm": 0.018577303737401962, + "learning_rate": 0.29750823434934165, + "loss": 0.1555, + "num_input_tokens_seen": 2103456, + "step": 2325 + }, + { + "epoch": 0.6150191368615547, + "grad_norm": 0.007249258458614349, + "learning_rate": 0.29749753087663217, + "loss": 0.1297, + "num_input_tokens_seen": 2108384, + "step": 2330 + }, + { + "epoch": 0.6163389204170516, + "grad_norm": 0.005622681695967913, + "learning_rate": 0.29748680465794985, + "loss": 0.1275, + "num_input_tokens_seen": 2112896, + "step": 2335 + }, + { + "epoch": 0.6176587039725485, + "grad_norm": 0.006864288356155157, + "learning_rate": 0.29747605569494884, + "loss": 0.1332, + "num_input_tokens_seen": 2117600, + "step": 2340 + }, + { + "epoch": 0.6189784875280454, + "grad_norm": 0.01258816011250019, + "learning_rate": 0.29746528398928673, + "loss": 0.1475, + "num_input_tokens_seen": 2122144, + "step": 2345 + }, + { + "epoch": 0.6202982710835423, + "grad_norm": 0.01215493306517601, + "learning_rate": 0.2974544895426247, + "loss": 0.1292, + "num_input_tokens_seen": 2126720, + "step": 2350 + }, + { + "epoch": 0.6216180546390392, + "grad_norm": 0.01050113420933485, + "learning_rate": 0.29744367235662733, + "loss": 0.1476, + "num_input_tokens_seen": 2131232, + "step": 2355 + }, + { + "epoch": 0.6229378381945361, + "grad_norm": 0.009394998662173748, + "learning_rate": 0.29743283243296276, + "loss": 0.1155, + "num_input_tokens_seen": 2135488, + "step": 2360 + }, + { + "epoch": 0.624257621750033, + "grad_norm": 0.010189434513449669, + "learning_rate": 0.29742196977330276, + "loss": 0.1122, + "num_input_tokens_seen": 2140064, + "step": 2365 + }, + { + "epoch": 0.6255774053055299, + "grad_norm": 0.006216110195964575, + "learning_rate": 0.2974110843793223, + "loss": 0.1207, + "num_input_tokens_seen": 2144896, + "step": 2370 + }, + { + "epoch": 0.6268971888610267, + "grad_norm": 0.007494955323636532, + "learning_rate": 0.2974001762527002, + "loss": 0.0967, + "num_input_tokens_seen": 2149376, + "step": 2375 + }, + { + "epoch": 0.6282169724165237, + "grad_norm": 0.021241893991827965, + "learning_rate": 0.2973892453951186, + "loss": 0.1636, + "num_input_tokens_seen": 2153920, + "step": 2380 + }, + { + "epoch": 0.6295367559720206, + "grad_norm": 0.010070257820189, + "learning_rate": 0.2973782918082631, + "loss": 0.1336, + "num_input_tokens_seen": 2158464, + "step": 2385 + }, + { + "epoch": 0.6308565395275175, + "grad_norm": 0.010666589252650738, + "learning_rate": 0.29736731549382295, + "loss": 0.0985, + "num_input_tokens_seen": 2162944, + "step": 2390 + }, + { + "epoch": 0.6321763230830144, + "grad_norm": 0.007234958931803703, + "learning_rate": 0.2973563164534908, + "loss": 0.1546, + "num_input_tokens_seen": 2167232, + "step": 2395 + }, + { + "epoch": 0.6334961066385113, + "grad_norm": 0.012964232824742794, + "learning_rate": 0.29734529468896287, + "loss": 0.1122, + "num_input_tokens_seen": 2171744, + "step": 2400 + }, + { + "epoch": 0.6334961066385113, + "eval_loss": 0.1412462443113327, + "eval_runtime": 75.7803, + "eval_samples_per_second": 88.875, + "eval_steps_per_second": 22.222, + "num_input_tokens_seen": 2171744, + "step": 2400 + }, + { + "epoch": 0.6348158901940082, + "grad_norm": 0.013453594408929348, + "learning_rate": 0.2973342502019388, + "loss": 0.1769, + "num_input_tokens_seen": 2176096, + "step": 2405 + }, + { + "epoch": 0.6361356737495051, + "grad_norm": 0.010410093702375889, + "learning_rate": 0.2973231829941219, + "loss": 0.1386, + "num_input_tokens_seen": 2180448, + "step": 2410 + }, + { + "epoch": 0.637455457305002, + "grad_norm": 0.008238626644015312, + "learning_rate": 0.2973120930672188, + "loss": 0.1201, + "num_input_tokens_seen": 2185248, + "step": 2415 + }, + { + "epoch": 0.6387752408604989, + "grad_norm": 0.0077172392047941685, + "learning_rate": 0.2973009804229397, + "loss": 0.146, + "num_input_tokens_seen": 2189760, + "step": 2420 + }, + { + "epoch": 0.6400950244159958, + "grad_norm": 0.012194694019854069, + "learning_rate": 0.29728984506299827, + "loss": 0.1552, + "num_input_tokens_seen": 2194432, + "step": 2425 + }, + { + "epoch": 0.6414148079714926, + "grad_norm": 0.009669707156717777, + "learning_rate": 0.2972786869891118, + "loss": 0.1346, + "num_input_tokens_seen": 2198912, + "step": 2430 + }, + { + "epoch": 0.6427345915269895, + "grad_norm": 0.006221890449523926, + "learning_rate": 0.29726750620300096, + "loss": 0.117, + "num_input_tokens_seen": 2203616, + "step": 2435 + }, + { + "epoch": 0.6440543750824864, + "grad_norm": 0.0052872225642204285, + "learning_rate": 0.29725630270639003, + "loss": 0.1248, + "num_input_tokens_seen": 2208256, + "step": 2440 + }, + { + "epoch": 0.6453741586379834, + "grad_norm": 0.014620673842728138, + "learning_rate": 0.2972450765010067, + "loss": 0.1108, + "num_input_tokens_seen": 2212704, + "step": 2445 + }, + { + "epoch": 0.6466939421934803, + "grad_norm": 0.006875451188534498, + "learning_rate": 0.29723382758858213, + "loss": 0.0978, + "num_input_tokens_seen": 2217152, + "step": 2450 + }, + { + "epoch": 0.6480137257489772, + "grad_norm": 0.00837685540318489, + "learning_rate": 0.29722255597085107, + "loss": 0.1111, + "num_input_tokens_seen": 2221696, + "step": 2455 + }, + { + "epoch": 0.6493335093044741, + "grad_norm": 0.017361611127853394, + "learning_rate": 0.2972112616495518, + "loss": 0.151, + "num_input_tokens_seen": 2225888, + "step": 2460 + }, + { + "epoch": 0.650653292859971, + "grad_norm": 0.00874682143330574, + "learning_rate": 0.297199944626426, + "loss": 0.1168, + "num_input_tokens_seen": 2230240, + "step": 2465 + }, + { + "epoch": 0.6519730764154679, + "grad_norm": 0.00815208163112402, + "learning_rate": 0.2971886049032189, + "loss": 0.1075, + "num_input_tokens_seen": 2235040, + "step": 2470 + }, + { + "epoch": 0.6532928599709648, + "grad_norm": 0.012338695116341114, + "learning_rate": 0.29717724248167926, + "loss": 0.1351, + "num_input_tokens_seen": 2239488, + "step": 2475 + }, + { + "epoch": 0.6546126435264616, + "grad_norm": 0.00645091850310564, + "learning_rate": 0.29716585736355927, + "loss": 0.1383, + "num_input_tokens_seen": 2243808, + "step": 2480 + }, + { + "epoch": 0.6559324270819585, + "grad_norm": 0.013032311573624611, + "learning_rate": 0.2971544495506147, + "loss": 0.121, + "num_input_tokens_seen": 2248448, + "step": 2485 + }, + { + "epoch": 0.6572522106374554, + "grad_norm": 0.005674304440617561, + "learning_rate": 0.2971430190446048, + "loss": 0.1147, + "num_input_tokens_seen": 2253056, + "step": 2490 + }, + { + "epoch": 0.6585719941929523, + "grad_norm": 0.005971104837954044, + "learning_rate": 0.2971315658472921, + "loss": 0.1714, + "num_input_tokens_seen": 2257504, + "step": 2495 + }, + { + "epoch": 0.6598917777484492, + "grad_norm": 0.005414428189396858, + "learning_rate": 0.2971200899604431, + "loss": 0.1098, + "num_input_tokens_seen": 2261952, + "step": 2500 + }, + { + "epoch": 0.6612115613039462, + "grad_norm": 0.005849938374012709, + "learning_rate": 0.29710859138582735, + "loss": 0.0925, + "num_input_tokens_seen": 2266208, + "step": 2505 + }, + { + "epoch": 0.6625313448594431, + "grad_norm": 0.008009469136595726, + "learning_rate": 0.29709707012521813, + "loss": 0.1496, + "num_input_tokens_seen": 2270592, + "step": 2510 + }, + { + "epoch": 0.66385112841494, + "grad_norm": 0.015158958733081818, + "learning_rate": 0.29708552618039213, + "loss": 0.195, + "num_input_tokens_seen": 2275168, + "step": 2515 + }, + { + "epoch": 0.6651709119704369, + "grad_norm": 0.006461248733103275, + "learning_rate": 0.2970739595531296, + "loss": 0.1296, + "num_input_tokens_seen": 2279520, + "step": 2520 + }, + { + "epoch": 0.6664906955259338, + "grad_norm": 0.008321602828800678, + "learning_rate": 0.2970623702452143, + "loss": 0.1461, + "num_input_tokens_seen": 2284000, + "step": 2525 + }, + { + "epoch": 0.6678104790814307, + "grad_norm": 0.008197221904993057, + "learning_rate": 0.2970507582584334, + "loss": 0.1306, + "num_input_tokens_seen": 2288704, + "step": 2530 + }, + { + "epoch": 0.6691302626369275, + "grad_norm": 0.008507643826305866, + "learning_rate": 0.2970391235945776, + "loss": 0.1306, + "num_input_tokens_seen": 2293120, + "step": 2535 + }, + { + "epoch": 0.6704500461924244, + "grad_norm": 0.016884298995137215, + "learning_rate": 0.2970274662554412, + "loss": 0.1594, + "num_input_tokens_seen": 2297792, + "step": 2540 + }, + { + "epoch": 0.6717698297479213, + "grad_norm": 0.0146127725020051, + "learning_rate": 0.2970157862428218, + "loss": 0.1442, + "num_input_tokens_seen": 2302304, + "step": 2545 + }, + { + "epoch": 0.6730896133034182, + "grad_norm": 0.01232364121824503, + "learning_rate": 0.2970040835585206, + "loss": 0.1398, + "num_input_tokens_seen": 2307072, + "step": 2550 + }, + { + "epoch": 0.6744093968589151, + "grad_norm": 0.011160590685904026, + "learning_rate": 0.2969923582043424, + "loss": 0.1517, + "num_input_tokens_seen": 2311680, + "step": 2555 + }, + { + "epoch": 0.675729180414412, + "grad_norm": 0.006129463668912649, + "learning_rate": 0.2969806101820953, + "loss": 0.1559, + "num_input_tokens_seen": 2315904, + "step": 2560 + }, + { + "epoch": 0.677048963969909, + "grad_norm": 0.007556830998510122, + "learning_rate": 0.2969688394935911, + "loss": 0.1428, + "num_input_tokens_seen": 2320864, + "step": 2565 + }, + { + "epoch": 0.6783687475254059, + "grad_norm": 0.007181066554039717, + "learning_rate": 0.2969570461406449, + "loss": 0.095, + "num_input_tokens_seen": 2325312, + "step": 2570 + }, + { + "epoch": 0.6796885310809028, + "grad_norm": 0.005729834549129009, + "learning_rate": 0.29694523012507534, + "loss": 0.1195, + "num_input_tokens_seen": 2329856, + "step": 2575 + }, + { + "epoch": 0.6810083146363997, + "grad_norm": 0.010792624205350876, + "learning_rate": 0.2969333914487048, + "loss": 0.1518, + "num_input_tokens_seen": 2334336, + "step": 2580 + }, + { + "epoch": 0.6823280981918965, + "grad_norm": 0.012832139618694782, + "learning_rate": 0.2969215301133587, + "loss": 0.1058, + "num_input_tokens_seen": 2338848, + "step": 2585 + }, + { + "epoch": 0.6836478817473934, + "grad_norm": 0.009142483584582806, + "learning_rate": 0.29690964612086634, + "loss": 0.0883, + "num_input_tokens_seen": 2343072, + "step": 2590 + }, + { + "epoch": 0.6849676653028903, + "grad_norm": 0.011859918013215065, + "learning_rate": 0.2968977394730604, + "loss": 0.1308, + "num_input_tokens_seen": 2348000, + "step": 2595 + }, + { + "epoch": 0.6862874488583872, + "grad_norm": 0.0075095659121870995, + "learning_rate": 0.296885810171777, + "loss": 0.1436, + "num_input_tokens_seen": 2352352, + "step": 2600 + }, + { + "epoch": 0.6862874488583872, + "eval_loss": 0.1598982810974121, + "eval_runtime": 75.7952, + "eval_samples_per_second": 88.858, + "eval_steps_per_second": 22.218, + "num_input_tokens_seen": 2352352, + "step": 2600 + }, + { + "epoch": 0.6876072324138841, + "grad_norm": 0.012599133886396885, + "learning_rate": 0.2968738582188558, + "loss": 0.1945, + "num_input_tokens_seen": 2357152, + "step": 2605 + }, + { + "epoch": 0.688927015969381, + "grad_norm": 0.018023423850536346, + "learning_rate": 0.2968618836161399, + "loss": 0.1945, + "num_input_tokens_seen": 2361696, + "step": 2610 + }, + { + "epoch": 0.6902467995248779, + "grad_norm": 0.010280471295118332, + "learning_rate": 0.296849886365476, + "loss": 0.1518, + "num_input_tokens_seen": 2365984, + "step": 2615 + }, + { + "epoch": 0.6915665830803748, + "grad_norm": 0.007083119358867407, + "learning_rate": 0.2968378664687142, + "loss": 0.2004, + "num_input_tokens_seen": 2370592, + "step": 2620 + }, + { + "epoch": 0.6928863666358717, + "grad_norm": 0.011091338470578194, + "learning_rate": 0.296825823927708, + "loss": 0.1604, + "num_input_tokens_seen": 2375200, + "step": 2625 + }, + { + "epoch": 0.6942061501913687, + "grad_norm": 0.014347190037369728, + "learning_rate": 0.29681375874431476, + "loss": 0.1269, + "num_input_tokens_seen": 2379552, + "step": 2630 + }, + { + "epoch": 0.6955259337468656, + "grad_norm": 0.007958282716572285, + "learning_rate": 0.29680167092039483, + "loss": 0.143, + "num_input_tokens_seen": 2384032, + "step": 2635 + }, + { + "epoch": 0.6968457173023624, + "grad_norm": 0.009662545286118984, + "learning_rate": 0.2967895604578125, + "loss": 0.1086, + "num_input_tokens_seen": 2388576, + "step": 2640 + }, + { + "epoch": 0.6981655008578593, + "grad_norm": 0.013394586741924286, + "learning_rate": 0.2967774273584352, + "loss": 0.1613, + "num_input_tokens_seen": 2392864, + "step": 2645 + }, + { + "epoch": 0.6994852844133562, + "grad_norm": 0.00733777042478323, + "learning_rate": 0.2967652716241342, + "loss": 0.1326, + "num_input_tokens_seen": 2397216, + "step": 2650 + }, + { + "epoch": 0.7008050679688531, + "grad_norm": 0.011822053231298923, + "learning_rate": 0.29675309325678384, + "loss": 0.1592, + "num_input_tokens_seen": 2401632, + "step": 2655 + }, + { + "epoch": 0.70212485152435, + "grad_norm": 0.0077966670505702496, + "learning_rate": 0.29674089225826233, + "loss": 0.1473, + "num_input_tokens_seen": 2406272, + "step": 2660 + }, + { + "epoch": 0.7034446350798469, + "grad_norm": 0.008523115888237953, + "learning_rate": 0.29672866863045116, + "loss": 0.1973, + "num_input_tokens_seen": 2411008, + "step": 2665 + }, + { + "epoch": 0.7047644186353438, + "grad_norm": 0.009427216835319996, + "learning_rate": 0.2967164223752354, + "loss": 0.1371, + "num_input_tokens_seen": 2415552, + "step": 2670 + }, + { + "epoch": 0.7060842021908407, + "grad_norm": 0.006809368263930082, + "learning_rate": 0.2967041534945035, + "loss": 0.1494, + "num_input_tokens_seen": 2419872, + "step": 2675 + }, + { + "epoch": 0.7074039857463376, + "grad_norm": 0.006714246701449156, + "learning_rate": 0.2966918619901476, + "loss": 0.1846, + "num_input_tokens_seen": 2424448, + "step": 2680 + }, + { + "epoch": 0.7087237693018344, + "grad_norm": 0.0066482070833444595, + "learning_rate": 0.2966795478640631, + "loss": 0.1221, + "num_input_tokens_seen": 2428992, + "step": 2685 + }, + { + "epoch": 0.7100435528573315, + "grad_norm": 0.011060399934649467, + "learning_rate": 0.29666721111814903, + "loss": 0.1548, + "num_input_tokens_seen": 2433440, + "step": 2690 + }, + { + "epoch": 0.7113633364128283, + "grad_norm": 0.00633785966783762, + "learning_rate": 0.2966548517543079, + "loss": 0.1167, + "num_input_tokens_seen": 2437568, + "step": 2695 + }, + { + "epoch": 0.7126831199683252, + "grad_norm": 0.011970357038080692, + "learning_rate": 0.29664246977444564, + "loss": 0.1326, + "num_input_tokens_seen": 2441984, + "step": 2700 + }, + { + "epoch": 0.7140029035238221, + "grad_norm": 0.008412486873567104, + "learning_rate": 0.2966300651804717, + "loss": 0.1492, + "num_input_tokens_seen": 2446560, + "step": 2705 + }, + { + "epoch": 0.715322687079319, + "grad_norm": 0.01985471136868, + "learning_rate": 0.296617637974299, + "loss": 0.1196, + "num_input_tokens_seen": 2450848, + "step": 2710 + }, + { + "epoch": 0.7166424706348159, + "grad_norm": 0.007860824465751648, + "learning_rate": 0.2966051881578441, + "loss": 0.1055, + "num_input_tokens_seen": 2455296, + "step": 2715 + }, + { + "epoch": 0.7179622541903128, + "grad_norm": 0.017181284725666046, + "learning_rate": 0.29659271573302676, + "loss": 0.1632, + "num_input_tokens_seen": 2459872, + "step": 2720 + }, + { + "epoch": 0.7192820377458097, + "grad_norm": 0.00761437090113759, + "learning_rate": 0.2965802207017705, + "loss": 0.124, + "num_input_tokens_seen": 2464128, + "step": 2725 + }, + { + "epoch": 0.7206018213013066, + "grad_norm": 0.011663478799164295, + "learning_rate": 0.2965677030660021, + "loss": 0.1503, + "num_input_tokens_seen": 2468704, + "step": 2730 + }, + { + "epoch": 0.7219216048568035, + "grad_norm": 0.00875316746532917, + "learning_rate": 0.2965551628276521, + "loss": 0.1379, + "num_input_tokens_seen": 2473120, + "step": 2735 + }, + { + "epoch": 0.7232413884123003, + "grad_norm": 0.008121983148157597, + "learning_rate": 0.29654259998865423, + "loss": 0.1247, + "num_input_tokens_seen": 2477536, + "step": 2740 + }, + { + "epoch": 0.7245611719677972, + "grad_norm": 0.010762367397546768, + "learning_rate": 0.2965300145509458, + "loss": 0.1835, + "num_input_tokens_seen": 2481856, + "step": 2745 + }, + { + "epoch": 0.7258809555232941, + "grad_norm": 0.004774970002472401, + "learning_rate": 0.2965174065164678, + "loss": 0.1359, + "num_input_tokens_seen": 2486208, + "step": 2750 + }, + { + "epoch": 0.7272007390787911, + "grad_norm": 0.014603430405259132, + "learning_rate": 0.2965047758871644, + "loss": 0.1414, + "num_input_tokens_seen": 2491104, + "step": 2755 + }, + { + "epoch": 0.728520522634288, + "grad_norm": 0.01050829142332077, + "learning_rate": 0.2964921226649835, + "loss": 0.1416, + "num_input_tokens_seen": 2495488, + "step": 2760 + }, + { + "epoch": 0.7298403061897849, + "grad_norm": 0.014131473377346992, + "learning_rate": 0.2964794468518763, + "loss": 0.1331, + "num_input_tokens_seen": 2500160, + "step": 2765 + }, + { + "epoch": 0.7311600897452818, + "grad_norm": 0.013707374222576618, + "learning_rate": 0.2964667484497977, + "loss": 0.1331, + "num_input_tokens_seen": 2504640, + "step": 2770 + }, + { + "epoch": 0.7324798733007787, + "grad_norm": 0.015272718854248524, + "learning_rate": 0.29645402746070587, + "loss": 0.1858, + "num_input_tokens_seen": 2509120, + "step": 2775 + }, + { + "epoch": 0.7337996568562756, + "grad_norm": 0.006447607651352882, + "learning_rate": 0.2964412838865625, + "loss": 0.1321, + "num_input_tokens_seen": 2513664, + "step": 2780 + }, + { + "epoch": 0.7351194404117725, + "grad_norm": 0.007825171574950218, + "learning_rate": 0.29642851772933293, + "loss": 0.1294, + "num_input_tokens_seen": 2518304, + "step": 2785 + }, + { + "epoch": 0.7364392239672694, + "grad_norm": 0.008840548805892467, + "learning_rate": 0.29641572899098567, + "loss": 0.1297, + "num_input_tokens_seen": 2523072, + "step": 2790 + }, + { + "epoch": 0.7377590075227662, + "grad_norm": 0.014507043175399303, + "learning_rate": 0.29640291767349314, + "loss": 0.1329, + "num_input_tokens_seen": 2527616, + "step": 2795 + }, + { + "epoch": 0.7390787910782631, + "grad_norm": 0.008153405971825123, + "learning_rate": 0.2963900837788308, + "loss": 0.1558, + "num_input_tokens_seen": 2532128, + "step": 2800 + }, + { + "epoch": 0.7390787910782631, + "eval_loss": 0.1569027155637741, + "eval_runtime": 75.7139, + "eval_samples_per_second": 88.953, + "eval_steps_per_second": 22.242, + "num_input_tokens_seen": 2532128, + "step": 2800 + }, + { + "epoch": 0.74039857463376, + "grad_norm": 0.008673243224620819, + "learning_rate": 0.2963772273089779, + "loss": 0.1678, + "num_input_tokens_seen": 2536544, + "step": 2805 + }, + { + "epoch": 0.7417183581892569, + "grad_norm": 0.007443676702678204, + "learning_rate": 0.2963643482659171, + "loss": 0.1771, + "num_input_tokens_seen": 2540768, + "step": 2810 + }, + { + "epoch": 0.7430381417447539, + "grad_norm": 0.009915649890899658, + "learning_rate": 0.2963514466516345, + "loss": 0.1893, + "num_input_tokens_seen": 2545440, + "step": 2815 + }, + { + "epoch": 0.7443579253002508, + "grad_norm": 0.006673421245068312, + "learning_rate": 0.2963385224681196, + "loss": 0.1732, + "num_input_tokens_seen": 2550080, + "step": 2820 + }, + { + "epoch": 0.7456777088557477, + "grad_norm": 0.006419319659471512, + "learning_rate": 0.29632557571736556, + "loss": 0.1644, + "num_input_tokens_seen": 2554720, + "step": 2825 + }, + { + "epoch": 0.7469974924112446, + "grad_norm": 0.010875841602683067, + "learning_rate": 0.2963126064013689, + "loss": 0.1357, + "num_input_tokens_seen": 2559200, + "step": 2830 + }, + { + "epoch": 0.7483172759667415, + "grad_norm": 0.006546637509018183, + "learning_rate": 0.29629961452212966, + "loss": 0.1582, + "num_input_tokens_seen": 2563392, + "step": 2835 + }, + { + "epoch": 0.7496370595222384, + "grad_norm": 0.007879581302404404, + "learning_rate": 0.2962866000816513, + "loss": 0.1339, + "num_input_tokens_seen": 2567648, + "step": 2840 + }, + { + "epoch": 0.7509568430777352, + "grad_norm": 0.008552614599466324, + "learning_rate": 0.2962735630819409, + "loss": 0.1355, + "num_input_tokens_seen": 2572096, + "step": 2845 + }, + { + "epoch": 0.7522766266332321, + "grad_norm": 0.014765430241823196, + "learning_rate": 0.2962605035250089, + "loss": 0.1863, + "num_input_tokens_seen": 2576736, + "step": 2850 + }, + { + "epoch": 0.753596410188729, + "grad_norm": 0.01746387407183647, + "learning_rate": 0.29624742141286914, + "loss": 0.1561, + "num_input_tokens_seen": 2581312, + "step": 2855 + }, + { + "epoch": 0.7549161937442259, + "grad_norm": 0.005951301660388708, + "learning_rate": 0.29623431674753925, + "loss": 0.1251, + "num_input_tokens_seen": 2586016, + "step": 2860 + }, + { + "epoch": 0.7562359772997228, + "grad_norm": 0.004973443690687418, + "learning_rate": 0.29622118953103993, + "loss": 0.135, + "num_input_tokens_seen": 2590464, + "step": 2865 + }, + { + "epoch": 0.7575557608552197, + "grad_norm": 0.005592306610196829, + "learning_rate": 0.2962080397653957, + "loss": 0.1315, + "num_input_tokens_seen": 2595040, + "step": 2870 + }, + { + "epoch": 0.7588755444107167, + "grad_norm": 0.005828335415571928, + "learning_rate": 0.29619486745263435, + "loss": 0.1343, + "num_input_tokens_seen": 2599616, + "step": 2875 + }, + { + "epoch": 0.7601953279662136, + "grad_norm": 0.012083166278898716, + "learning_rate": 0.2961816725947873, + "loss": 0.1002, + "num_input_tokens_seen": 2603968, + "step": 2880 + }, + { + "epoch": 0.7615151115217105, + "grad_norm": 0.010511679574847221, + "learning_rate": 0.29616845519388924, + "loss": 0.1539, + "num_input_tokens_seen": 2608384, + "step": 2885 + }, + { + "epoch": 0.7628348950772074, + "grad_norm": 0.009000366553664207, + "learning_rate": 0.2961552152519785, + "loss": 0.1462, + "num_input_tokens_seen": 2613056, + "step": 2890 + }, + { + "epoch": 0.7641546786327043, + "grad_norm": 0.011046675965189934, + "learning_rate": 0.29614195277109695, + "loss": 0.1306, + "num_input_tokens_seen": 2617760, + "step": 2895 + }, + { + "epoch": 0.7654744621882011, + "grad_norm": 0.014341295696794987, + "learning_rate": 0.2961286677532897, + "loss": 0.1367, + "num_input_tokens_seen": 2622720, + "step": 2900 + }, + { + "epoch": 0.766794245743698, + "grad_norm": 0.014643283560872078, + "learning_rate": 0.2961153602006055, + "loss": 0.1357, + "num_input_tokens_seen": 2627200, + "step": 2905 + }, + { + "epoch": 0.7681140292991949, + "grad_norm": 0.011149345897138119, + "learning_rate": 0.29610203011509656, + "loss": 0.1161, + "num_input_tokens_seen": 2631424, + "step": 2910 + }, + { + "epoch": 0.7694338128546918, + "grad_norm": 0.006814662832766771, + "learning_rate": 0.29608867749881856, + "loss": 0.1113, + "num_input_tokens_seen": 2636032, + "step": 2915 + }, + { + "epoch": 0.7707535964101887, + "grad_norm": 0.011089387349784374, + "learning_rate": 0.29607530235383067, + "loss": 0.1038, + "num_input_tokens_seen": 2640544, + "step": 2920 + }, + { + "epoch": 0.7720733799656856, + "grad_norm": 0.011054376140236855, + "learning_rate": 0.2960619046821954, + "loss": 0.1211, + "num_input_tokens_seen": 2645056, + "step": 2925 + }, + { + "epoch": 0.7733931635211825, + "grad_norm": 0.01198354922235012, + "learning_rate": 0.2960484844859789, + "loss": 0.0878, + "num_input_tokens_seen": 2649632, + "step": 2930 + }, + { + "epoch": 0.7747129470766794, + "grad_norm": 0.016130516305565834, + "learning_rate": 0.29603504176725076, + "loss": 0.1505, + "num_input_tokens_seen": 2654336, + "step": 2935 + }, + { + "epoch": 0.7760327306321764, + "grad_norm": 0.013262863270938396, + "learning_rate": 0.296021576528084, + "loss": 0.1852, + "num_input_tokens_seen": 2659040, + "step": 2940 + }, + { + "epoch": 0.7773525141876733, + "grad_norm": 0.005065008997917175, + "learning_rate": 0.29600808877055507, + "loss": 0.141, + "num_input_tokens_seen": 2663872, + "step": 2945 + }, + { + "epoch": 0.7786722977431701, + "grad_norm": 0.005682351533323526, + "learning_rate": 0.29599457849674404, + "loss": 0.1372, + "num_input_tokens_seen": 2668608, + "step": 2950 + }, + { + "epoch": 0.779992081298667, + "grad_norm": 0.00856239628046751, + "learning_rate": 0.2959810457087343, + "loss": 0.1431, + "num_input_tokens_seen": 2673248, + "step": 2955 + }, + { + "epoch": 0.7813118648541639, + "grad_norm": 0.007640113588422537, + "learning_rate": 0.2959674904086128, + "loss": 0.136, + "num_input_tokens_seen": 2677856, + "step": 2960 + }, + { + "epoch": 0.7826316484096608, + "grad_norm": 0.00942615419626236, + "learning_rate": 0.2959539125984699, + "loss": 0.1346, + "num_input_tokens_seen": 2682272, + "step": 2965 + }, + { + "epoch": 0.7839514319651577, + "grad_norm": 0.008096102625131607, + "learning_rate": 0.2959403122803996, + "loss": 0.1272, + "num_input_tokens_seen": 2686720, + "step": 2970 + }, + { + "epoch": 0.7852712155206546, + "grad_norm": 0.010244416072964668, + "learning_rate": 0.2959266894564991, + "loss": 0.1324, + "num_input_tokens_seen": 2691488, + "step": 2975 + }, + { + "epoch": 0.7865909990761515, + "grad_norm": 0.012211731635034084, + "learning_rate": 0.2959130441288692, + "loss": 0.1547, + "num_input_tokens_seen": 2695968, + "step": 2980 + }, + { + "epoch": 0.7879107826316484, + "grad_norm": 0.008720414713025093, + "learning_rate": 0.2958993762996143, + "loss": 0.1079, + "num_input_tokens_seen": 2700352, + "step": 2985 + }, + { + "epoch": 0.7892305661871453, + "grad_norm": 0.009370431303977966, + "learning_rate": 0.2958856859708421, + "loss": 0.1179, + "num_input_tokens_seen": 2704768, + "step": 2990 + }, + { + "epoch": 0.7905503497426422, + "grad_norm": 0.014067264273762703, + "learning_rate": 0.2958719731446638, + "loss": 0.1484, + "num_input_tokens_seen": 2709440, + "step": 2995 + }, + { + "epoch": 0.7918701332981392, + "grad_norm": 0.007746594958007336, + "learning_rate": 0.29585823782319404, + "loss": 0.1363, + "num_input_tokens_seen": 2713888, + "step": 3000 + }, + { + "epoch": 0.7918701332981392, + "eval_loss": 0.1301281750202179, + "eval_runtime": 75.788, + "eval_samples_per_second": 88.866, + "eval_steps_per_second": 22.22, + "num_input_tokens_seen": 2713888, + "step": 3000 + }, + { + "epoch": 0.793189916853636, + "grad_norm": 0.009947994723916054, + "learning_rate": 0.2958444800085511, + "loss": 0.174, + "num_input_tokens_seen": 2718240, + "step": 3005 + }, + { + "epoch": 0.7945097004091329, + "grad_norm": 0.009074540808796883, + "learning_rate": 0.2958306997028565, + "loss": 0.1485, + "num_input_tokens_seen": 2722432, + "step": 3010 + }, + { + "epoch": 0.7958294839646298, + "grad_norm": 0.007781613618135452, + "learning_rate": 0.2958168969082354, + "loss": 0.1695, + "num_input_tokens_seen": 2726624, + "step": 3015 + }, + { + "epoch": 0.7971492675201267, + "grad_norm": 0.00848623551428318, + "learning_rate": 0.2958030716268164, + "loss": 0.1437, + "num_input_tokens_seen": 2731424, + "step": 3020 + }, + { + "epoch": 0.7984690510756236, + "grad_norm": 0.013623623177409172, + "learning_rate": 0.2957892238607314, + "loss": 0.1428, + "num_input_tokens_seen": 2736000, + "step": 3025 + }, + { + "epoch": 0.7997888346311205, + "grad_norm": 0.005332346074283123, + "learning_rate": 0.2957753536121161, + "loss": 0.1289, + "num_input_tokens_seen": 2740736, + "step": 3030 + }, + { + "epoch": 0.8011086181866174, + "grad_norm": 0.0042720152996480465, + "learning_rate": 0.29576146088310923, + "loss": 0.1233, + "num_input_tokens_seen": 2745024, + "step": 3035 + }, + { + "epoch": 0.8024284017421143, + "grad_norm": 0.008238707669079304, + "learning_rate": 0.2957475456758533, + "loss": 0.1411, + "num_input_tokens_seen": 2749632, + "step": 3040 + }, + { + "epoch": 0.8037481852976112, + "grad_norm": 0.0064841522835195065, + "learning_rate": 0.2957336079924944, + "loss": 0.1236, + "num_input_tokens_seen": 2754336, + "step": 3045 + }, + { + "epoch": 0.805067968853108, + "grad_norm": 0.007963722571730614, + "learning_rate": 0.2957196478351816, + "loss": 0.1407, + "num_input_tokens_seen": 2759040, + "step": 3050 + }, + { + "epoch": 0.8063877524086049, + "grad_norm": 0.012321335263550282, + "learning_rate": 0.295705665206068, + "loss": 0.1398, + "num_input_tokens_seen": 2763392, + "step": 3055 + }, + { + "epoch": 0.8077075359641019, + "grad_norm": 0.00795640330761671, + "learning_rate": 0.2956916601073097, + "loss": 0.1303, + "num_input_tokens_seen": 2768064, + "step": 3060 + }, + { + "epoch": 0.8090273195195988, + "grad_norm": 0.01757146790623665, + "learning_rate": 0.29567763254106655, + "loss": 0.1178, + "num_input_tokens_seen": 2772736, + "step": 3065 + }, + { + "epoch": 0.8103471030750957, + "grad_norm": 0.00796378031373024, + "learning_rate": 0.29566358250950175, + "loss": 0.1441, + "num_input_tokens_seen": 2776992, + "step": 3070 + }, + { + "epoch": 0.8116668866305926, + "grad_norm": 0.01137225516140461, + "learning_rate": 0.295649510014782, + "loss": 0.1171, + "num_input_tokens_seen": 2781536, + "step": 3075 + }, + { + "epoch": 0.8129866701860895, + "grad_norm": 0.015847286209464073, + "learning_rate": 0.2956354150590775, + "loss": 0.1462, + "num_input_tokens_seen": 2785920, + "step": 3080 + }, + { + "epoch": 0.8143064537415864, + "grad_norm": 0.008926480077207088, + "learning_rate": 0.2956212976445618, + "loss": 0.1228, + "num_input_tokens_seen": 2790528, + "step": 3085 + }, + { + "epoch": 0.8156262372970833, + "grad_norm": 0.017084192484617233, + "learning_rate": 0.295607157773412, + "loss": 0.1347, + "num_input_tokens_seen": 2795168, + "step": 3090 + }, + { + "epoch": 0.8169460208525802, + "grad_norm": 0.0069526731967926025, + "learning_rate": 0.2955929954478087, + "loss": 0.1981, + "num_input_tokens_seen": 2799488, + "step": 3095 + }, + { + "epoch": 0.8182658044080771, + "grad_norm": 0.006104796193540096, + "learning_rate": 0.29557881066993585, + "loss": 0.149, + "num_input_tokens_seen": 2804160, + "step": 3100 + }, + { + "epoch": 0.819585587963574, + "grad_norm": 0.0049378350377082825, + "learning_rate": 0.29556460344198093, + "loss": 0.1271, + "num_input_tokens_seen": 2808576, + "step": 3105 + }, + { + "epoch": 0.8209053715190708, + "grad_norm": 0.01396691519767046, + "learning_rate": 0.29555037376613486, + "loss": 0.1737, + "num_input_tokens_seen": 2813088, + "step": 3110 + }, + { + "epoch": 0.8222251550745677, + "grad_norm": 0.006857842672616243, + "learning_rate": 0.29553612164459203, + "loss": 0.1258, + "num_input_tokens_seen": 2817536, + "step": 3115 + }, + { + "epoch": 0.8235449386300646, + "grad_norm": 0.01132013089954853, + "learning_rate": 0.29552184707955037, + "loss": 0.1841, + "num_input_tokens_seen": 2821856, + "step": 3120 + }, + { + "epoch": 0.8248647221855616, + "grad_norm": 0.007232239004224539, + "learning_rate": 0.29550755007321117, + "loss": 0.1129, + "num_input_tokens_seen": 2825920, + "step": 3125 + }, + { + "epoch": 0.8261845057410585, + "grad_norm": 0.0044736661948263645, + "learning_rate": 0.29549323062777916, + "loss": 0.12, + "num_input_tokens_seen": 2830624, + "step": 3130 + }, + { + "epoch": 0.8275042892965554, + "grad_norm": 0.010373227298259735, + "learning_rate": 0.29547888874546263, + "loss": 0.1468, + "num_input_tokens_seen": 2835200, + "step": 3135 + }, + { + "epoch": 0.8288240728520523, + "grad_norm": 0.015811119228601456, + "learning_rate": 0.2954645244284732, + "loss": 0.1752, + "num_input_tokens_seen": 2839520, + "step": 3140 + }, + { + "epoch": 0.8301438564075492, + "grad_norm": 0.010147128254175186, + "learning_rate": 0.2954501376790261, + "loss": 0.1479, + "num_input_tokens_seen": 2844128, + "step": 3145 + }, + { + "epoch": 0.8314636399630461, + "grad_norm": 0.007448455318808556, + "learning_rate": 0.29543572849933997, + "loss": 0.1598, + "num_input_tokens_seen": 2848896, + "step": 3150 + }, + { + "epoch": 0.832783423518543, + "grad_norm": 0.007255926262587309, + "learning_rate": 0.2954212968916368, + "loss": 0.1152, + "num_input_tokens_seen": 2853216, + "step": 3155 + }, + { + "epoch": 0.8341032070740398, + "grad_norm": 0.005073729902505875, + "learning_rate": 0.29540684285814217, + "loss": 0.1081, + "num_input_tokens_seen": 2857760, + "step": 3160 + }, + { + "epoch": 0.8354229906295367, + "grad_norm": 0.008972669951617718, + "learning_rate": 0.2953923664010851, + "loss": 0.1174, + "num_input_tokens_seen": 2862176, + "step": 3165 + }, + { + "epoch": 0.8367427741850336, + "grad_norm": 0.007944876328110695, + "learning_rate": 0.295377867522698, + "loss": 0.1137, + "num_input_tokens_seen": 2866816, + "step": 3170 + }, + { + "epoch": 0.8380625577405305, + "grad_norm": 0.010569440200924873, + "learning_rate": 0.2953633462252168, + "loss": 0.1314, + "num_input_tokens_seen": 2871264, + "step": 3175 + }, + { + "epoch": 0.8393823412960274, + "grad_norm": 0.003876127302646637, + "learning_rate": 0.2953488025108809, + "loss": 0.0848, + "num_input_tokens_seen": 2875936, + "step": 3180 + }, + { + "epoch": 0.8407021248515244, + "grad_norm": 0.007948247715830803, + "learning_rate": 0.295334236381933, + "loss": 0.1019, + "num_input_tokens_seen": 2880576, + "step": 3185 + }, + { + "epoch": 0.8420219084070213, + "grad_norm": 0.009549833834171295, + "learning_rate": 0.29531964784061954, + "loss": 0.1457, + "num_input_tokens_seen": 2884896, + "step": 3190 + }, + { + "epoch": 0.8433416919625182, + "grad_norm": 0.011005191132426262, + "learning_rate": 0.2953050368891902, + "loss": 0.1313, + "num_input_tokens_seen": 2889664, + "step": 3195 + }, + { + "epoch": 0.8446614755180151, + "grad_norm": 0.011474990285933018, + "learning_rate": 0.29529040352989805, + "loss": 0.0991, + "num_input_tokens_seen": 2894304, + "step": 3200 + }, + { + "epoch": 0.8446614755180151, + "eval_loss": 0.15528038144111633, + "eval_runtime": 75.8575, + "eval_samples_per_second": 88.785, + "eval_steps_per_second": 22.2, + "num_input_tokens_seen": 2894304, + "step": 3200 + }, + { + "epoch": 0.845981259073512, + "grad_norm": 0.00757970567792654, + "learning_rate": 0.29527574776499993, + "loss": 0.1428, + "num_input_tokens_seen": 2899232, + "step": 3205 + }, + { + "epoch": 0.8473010426290088, + "grad_norm": 0.01368318684399128, + "learning_rate": 0.2952610695967558, + "loss": 0.1303, + "num_input_tokens_seen": 2903968, + "step": 3210 + }, + { + "epoch": 0.8486208261845057, + "grad_norm": 0.005612476263195276, + "learning_rate": 0.29524636902742935, + "loss": 0.1366, + "num_input_tokens_seen": 2908576, + "step": 3215 + }, + { + "epoch": 0.8499406097400026, + "grad_norm": 0.013205084018409252, + "learning_rate": 0.2952316460592875, + "loss": 0.1346, + "num_input_tokens_seen": 2913120, + "step": 3220 + }, + { + "epoch": 0.8512603932954995, + "grad_norm": 0.011840049177408218, + "learning_rate": 0.29521690069460066, + "loss": 0.153, + "num_input_tokens_seen": 2917280, + "step": 3225 + }, + { + "epoch": 0.8525801768509964, + "grad_norm": 0.0049454341642558575, + "learning_rate": 0.29520213293564285, + "loss": 0.1338, + "num_input_tokens_seen": 2921792, + "step": 3230 + }, + { + "epoch": 0.8538999604064933, + "grad_norm": 0.010351860895752907, + "learning_rate": 0.29518734278469144, + "loss": 0.1291, + "num_input_tokens_seen": 2926208, + "step": 3235 + }, + { + "epoch": 0.8552197439619902, + "grad_norm": 0.006408391986042261, + "learning_rate": 0.29517253024402723, + "loss": 0.1002, + "num_input_tokens_seen": 2930848, + "step": 3240 + }, + { + "epoch": 0.8565395275174872, + "grad_norm": 0.0044602034613490105, + "learning_rate": 0.2951576953159345, + "loss": 0.083, + "num_input_tokens_seen": 2935104, + "step": 3245 + }, + { + "epoch": 0.8578593110729841, + "grad_norm": 0.006727683357894421, + "learning_rate": 0.29514283800270097, + "loss": 0.1594, + "num_input_tokens_seen": 2939584, + "step": 3250 + }, + { + "epoch": 0.859179094628481, + "grad_norm": 0.0074065076187253, + "learning_rate": 0.2951279583066179, + "loss": 0.106, + "num_input_tokens_seen": 2943968, + "step": 3255 + }, + { + "epoch": 0.8604988781839779, + "grad_norm": 0.01000774186104536, + "learning_rate": 0.2951130562299798, + "loss": 0.1175, + "num_input_tokens_seen": 2948320, + "step": 3260 + }, + { + "epoch": 0.8618186617394747, + "grad_norm": 0.005520226433873177, + "learning_rate": 0.29509813177508487, + "loss": 0.0725, + "num_input_tokens_seen": 2952832, + "step": 3265 + }, + { + "epoch": 0.8631384452949716, + "grad_norm": 0.014085515402257442, + "learning_rate": 0.2950831849442346, + "loss": 0.1765, + "num_input_tokens_seen": 2957696, + "step": 3270 + }, + { + "epoch": 0.8644582288504685, + "grad_norm": 0.010751460678875446, + "learning_rate": 0.2950682157397339, + "loss": 0.1281, + "num_input_tokens_seen": 2962080, + "step": 3275 + }, + { + "epoch": 0.8657780124059654, + "grad_norm": 0.010059507563710213, + "learning_rate": 0.2950532241638914, + "loss": 0.1781, + "num_input_tokens_seen": 2966624, + "step": 3280 + }, + { + "epoch": 0.8670977959614623, + "grad_norm": 0.012009276077151299, + "learning_rate": 0.2950382102190188, + "loss": 0.1292, + "num_input_tokens_seen": 2971296, + "step": 3285 + }, + { + "epoch": 0.8684175795169592, + "grad_norm": 0.005912376567721367, + "learning_rate": 0.2950231739074316, + "loss": 0.1313, + "num_input_tokens_seen": 2975776, + "step": 3290 + }, + { + "epoch": 0.8697373630724561, + "grad_norm": 0.008822551928460598, + "learning_rate": 0.29500811523144843, + "loss": 0.1564, + "num_input_tokens_seen": 2980096, + "step": 3295 + }, + { + "epoch": 0.871057146627953, + "grad_norm": 0.017678318545222282, + "learning_rate": 0.2949930341933917, + "loss": 0.1859, + "num_input_tokens_seen": 2984736, + "step": 3300 + }, + { + "epoch": 0.8723769301834499, + "grad_norm": 0.009252721443772316, + "learning_rate": 0.29497793079558693, + "loss": 0.1038, + "num_input_tokens_seen": 2989216, + "step": 3305 + }, + { + "epoch": 0.8736967137389469, + "grad_norm": 0.015080735087394714, + "learning_rate": 0.2949628050403633, + "loss": 0.145, + "num_input_tokens_seen": 2993632, + "step": 3310 + }, + { + "epoch": 0.8750164972944438, + "grad_norm": 0.007107872981578112, + "learning_rate": 0.2949476569300535, + "loss": 0.1279, + "num_input_tokens_seen": 2997984, + "step": 3315 + }, + { + "epoch": 0.8763362808499406, + "grad_norm": 0.010668512433767319, + "learning_rate": 0.29493248646699344, + "loss": 0.1407, + "num_input_tokens_seen": 3002720, + "step": 3320 + }, + { + "epoch": 0.8776560644054375, + "grad_norm": 0.012422683648765087, + "learning_rate": 0.29491729365352265, + "loss": 0.1577, + "num_input_tokens_seen": 3007584, + "step": 3325 + }, + { + "epoch": 0.8789758479609344, + "grad_norm": 0.008095539174973965, + "learning_rate": 0.29490207849198397, + "loss": 0.1826, + "num_input_tokens_seen": 3012288, + "step": 3330 + }, + { + "epoch": 0.8802956315164313, + "grad_norm": 0.010500083677470684, + "learning_rate": 0.29488684098472384, + "loss": 0.1279, + "num_input_tokens_seen": 3017088, + "step": 3335 + }, + { + "epoch": 0.8816154150719282, + "grad_norm": 0.006518995855003595, + "learning_rate": 0.2948715811340921, + "loss": 0.1058, + "num_input_tokens_seen": 3021536, + "step": 3340 + }, + { + "epoch": 0.8829351986274251, + "grad_norm": 0.015593232586979866, + "learning_rate": 0.294856298942442, + "loss": 0.1461, + "num_input_tokens_seen": 3026144, + "step": 3345 + }, + { + "epoch": 0.884254982182922, + "grad_norm": 0.007043647579848766, + "learning_rate": 0.2948409944121302, + "loss": 0.1018, + "num_input_tokens_seen": 3030848, + "step": 3350 + }, + { + "epoch": 0.8855747657384189, + "grad_norm": 0.009188278578221798, + "learning_rate": 0.29482566754551687, + "loss": 0.1163, + "num_input_tokens_seen": 3035488, + "step": 3355 + }, + { + "epoch": 0.8868945492939158, + "grad_norm": 0.00910679530352354, + "learning_rate": 0.2948103183449656, + "loss": 0.1714, + "num_input_tokens_seen": 3039904, + "step": 3360 + }, + { + "epoch": 0.8882143328494126, + "grad_norm": 0.005438362248241901, + "learning_rate": 0.2947949468128435, + "loss": 0.1371, + "num_input_tokens_seen": 3044832, + "step": 3365 + }, + { + "epoch": 0.8895341164049096, + "grad_norm": 0.008713318035006523, + "learning_rate": 0.2947795529515209, + "loss": 0.1417, + "num_input_tokens_seen": 3049536, + "step": 3370 + }, + { + "epoch": 0.8908538999604065, + "grad_norm": 0.005301532335579395, + "learning_rate": 0.29476413676337193, + "loss": 0.1668, + "num_input_tokens_seen": 3054048, + "step": 3375 + }, + { + "epoch": 0.8921736835159034, + "grad_norm": 0.004323055502027273, + "learning_rate": 0.2947486982507738, + "loss": 0.1362, + "num_input_tokens_seen": 3058432, + "step": 3380 + }, + { + "epoch": 0.8934934670714003, + "grad_norm": 0.010175890289247036, + "learning_rate": 0.29473323741610735, + "loss": 0.1287, + "num_input_tokens_seen": 3062976, + "step": 3385 + }, + { + "epoch": 0.8948132506268972, + "grad_norm": 0.01056748442351818, + "learning_rate": 0.2947177542617569, + "loss": 0.1186, + "num_input_tokens_seen": 3067584, + "step": 3390 + }, + { + "epoch": 0.8961330341823941, + "grad_norm": 0.009103585965931416, + "learning_rate": 0.2947022487901101, + "loss": 0.1647, + "num_input_tokens_seen": 3071968, + "step": 3395 + }, + { + "epoch": 0.897452817737891, + "grad_norm": 0.005193828139454126, + "learning_rate": 0.2946867210035581, + "loss": 0.1103, + "num_input_tokens_seen": 3076640, + "step": 3400 + }, + { + "epoch": 0.897452817737891, + "eval_loss": 0.13119037449359894, + "eval_runtime": 75.7122, + "eval_samples_per_second": 88.955, + "eval_steps_per_second": 22.242, + "num_input_tokens_seen": 3076640, + "step": 3400 + }, + { + "epoch": 0.8987726012933879, + "grad_norm": 0.007276215124875307, + "learning_rate": 0.2946711709044954, + "loss": 0.1355, + "num_input_tokens_seen": 3080832, + "step": 3405 + }, + { + "epoch": 0.9000923848488848, + "grad_norm": 0.010053726844489574, + "learning_rate": 0.2946555984953202, + "loss": 0.1316, + "num_input_tokens_seen": 3085344, + "step": 3410 + }, + { + "epoch": 0.9014121684043817, + "grad_norm": 0.0052037471905350685, + "learning_rate": 0.2946400037784338, + "loss": 0.078, + "num_input_tokens_seen": 3089792, + "step": 3415 + }, + { + "epoch": 0.9027319519598785, + "grad_norm": 0.0067391409538686275, + "learning_rate": 0.29462438675624114, + "loss": 0.1209, + "num_input_tokens_seen": 3094496, + "step": 3420 + }, + { + "epoch": 0.9040517355153754, + "grad_norm": 0.010378103703260422, + "learning_rate": 0.2946087474311506, + "loss": 0.1537, + "num_input_tokens_seen": 3099296, + "step": 3425 + }, + { + "epoch": 0.9053715190708723, + "grad_norm": 0.006824664305895567, + "learning_rate": 0.294593085805574, + "loss": 0.1057, + "num_input_tokens_seen": 3103424, + "step": 3430 + }, + { + "epoch": 0.9066913026263693, + "grad_norm": 0.012753785587847233, + "learning_rate": 0.2945774018819264, + "loss": 0.1706, + "num_input_tokens_seen": 3107904, + "step": 3435 + }, + { + "epoch": 0.9080110861818662, + "grad_norm": 0.005768671166151762, + "learning_rate": 0.2945616956626266, + "loss": 0.1218, + "num_input_tokens_seen": 3112416, + "step": 3440 + }, + { + "epoch": 0.9093308697373631, + "grad_norm": 0.00829215906560421, + "learning_rate": 0.2945459671500966, + "loss": 0.1063, + "num_input_tokens_seen": 3116800, + "step": 3445 + }, + { + "epoch": 0.91065065329286, + "grad_norm": 0.00923560094088316, + "learning_rate": 0.2945302163467621, + "loss": 0.164, + "num_input_tokens_seen": 3121536, + "step": 3450 + }, + { + "epoch": 0.9119704368483569, + "grad_norm": 0.010305834002792835, + "learning_rate": 0.2945144432550519, + "loss": 0.1411, + "num_input_tokens_seen": 3126112, + "step": 3455 + }, + { + "epoch": 0.9132902204038538, + "grad_norm": 0.008106628432869911, + "learning_rate": 0.29449864787739843, + "loss": 0.1778, + "num_input_tokens_seen": 3130688, + "step": 3460 + }, + { + "epoch": 0.9146100039593507, + "grad_norm": 0.005430615972727537, + "learning_rate": 0.2944828302162376, + "loss": 0.1626, + "num_input_tokens_seen": 3135008, + "step": 3465 + }, + { + "epoch": 0.9159297875148475, + "grad_norm": 0.00680575892329216, + "learning_rate": 0.2944669902740087, + "loss": 0.1275, + "num_input_tokens_seen": 3139296, + "step": 3470 + }, + { + "epoch": 0.9172495710703444, + "grad_norm": 0.008168618194758892, + "learning_rate": 0.2944511280531544, + "loss": 0.1678, + "num_input_tokens_seen": 3143872, + "step": 3475 + }, + { + "epoch": 0.9185693546258413, + "grad_norm": 0.010683761909604073, + "learning_rate": 0.29443524355612083, + "loss": 0.1389, + "num_input_tokens_seen": 3148448, + "step": 3480 + }, + { + "epoch": 0.9198891381813382, + "grad_norm": 0.005626813042908907, + "learning_rate": 0.29441933678535764, + "loss": 0.1659, + "num_input_tokens_seen": 3153280, + "step": 3485 + }, + { + "epoch": 0.9212089217368351, + "grad_norm": 0.00516707357019186, + "learning_rate": 0.29440340774331786, + "loss": 0.1689, + "num_input_tokens_seen": 3157952, + "step": 3490 + }, + { + "epoch": 0.9225287052923321, + "grad_norm": 0.013469919562339783, + "learning_rate": 0.2943874564324579, + "loss": 0.1646, + "num_input_tokens_seen": 3162336, + "step": 3495 + }, + { + "epoch": 0.923848488847829, + "grad_norm": 0.007430600933730602, + "learning_rate": 0.2943714828552376, + "loss": 0.1162, + "num_input_tokens_seen": 3166944, + "step": 3500 + }, + { + "epoch": 0.9251682724033259, + "grad_norm": 0.008874347433447838, + "learning_rate": 0.29435548701412045, + "loss": 0.19, + "num_input_tokens_seen": 3171360, + "step": 3505 + }, + { + "epoch": 0.9264880559588228, + "grad_norm": 0.00817199982702732, + "learning_rate": 0.2943394689115731, + "loss": 0.1192, + "num_input_tokens_seen": 3175936, + "step": 3510 + }, + { + "epoch": 0.9278078395143197, + "grad_norm": 0.0067103165201842785, + "learning_rate": 0.29432342855006577, + "loss": 0.1524, + "num_input_tokens_seen": 3180544, + "step": 3515 + }, + { + "epoch": 0.9291276230698166, + "grad_norm": 0.007937805727124214, + "learning_rate": 0.294307365932072, + "loss": 0.1352, + "num_input_tokens_seen": 3185184, + "step": 3520 + }, + { + "epoch": 0.9304474066253134, + "grad_norm": 0.009375336579978466, + "learning_rate": 0.294291281060069, + "loss": 0.1499, + "num_input_tokens_seen": 3189760, + "step": 3525 + }, + { + "epoch": 0.9317671901808103, + "grad_norm": 0.006220443639904261, + "learning_rate": 0.29427517393653724, + "loss": 0.1282, + "num_input_tokens_seen": 3194016, + "step": 3530 + }, + { + "epoch": 0.9330869737363072, + "grad_norm": 0.0060377283953130245, + "learning_rate": 0.29425904456396046, + "loss": 0.1125, + "num_input_tokens_seen": 3198368, + "step": 3535 + }, + { + "epoch": 0.9344067572918041, + "grad_norm": 0.009709720499813557, + "learning_rate": 0.2942428929448262, + "loss": 0.1382, + "num_input_tokens_seen": 3203168, + "step": 3540 + }, + { + "epoch": 0.935726540847301, + "grad_norm": 0.008693024516105652, + "learning_rate": 0.2942267190816252, + "loss": 0.1192, + "num_input_tokens_seen": 3207616, + "step": 3545 + }, + { + "epoch": 0.9370463244027979, + "grad_norm": 0.017346786335110664, + "learning_rate": 0.2942105229768516, + "loss": 0.1844, + "num_input_tokens_seen": 3212512, + "step": 3550 + }, + { + "epoch": 0.9383661079582949, + "grad_norm": 0.01304710004478693, + "learning_rate": 0.29419430463300306, + "loss": 0.1657, + "num_input_tokens_seen": 3216960, + "step": 3555 + }, + { + "epoch": 0.9396858915137918, + "grad_norm": 0.0048039634712040424, + "learning_rate": 0.2941780640525808, + "loss": 0.146, + "num_input_tokens_seen": 3221312, + "step": 3560 + }, + { + "epoch": 0.9410056750692887, + "grad_norm": 0.006913173943758011, + "learning_rate": 0.2941618012380891, + "loss": 0.1155, + "num_input_tokens_seen": 3225888, + "step": 3565 + }, + { + "epoch": 0.9423254586247856, + "grad_norm": 0.004686306696385145, + "learning_rate": 0.29414551619203605, + "loss": 0.1067, + "num_input_tokens_seen": 3230304, + "step": 3570 + }, + { + "epoch": 0.9436452421802825, + "grad_norm": 0.009047511965036392, + "learning_rate": 0.29412920891693295, + "loss": 0.1561, + "num_input_tokens_seen": 3234816, + "step": 3575 + }, + { + "epoch": 0.9449650257357793, + "grad_norm": 0.009381339885294437, + "learning_rate": 0.2941128794152946, + "loss": 0.1492, + "num_input_tokens_seen": 3239424, + "step": 3580 + }, + { + "epoch": 0.9462848092912762, + "grad_norm": 0.009593325667083263, + "learning_rate": 0.2940965276896392, + "loss": 0.1512, + "num_input_tokens_seen": 3244032, + "step": 3585 + }, + { + "epoch": 0.9476045928467731, + "grad_norm": 0.008440257981419563, + "learning_rate": 0.2940801537424884, + "loss": 0.0985, + "num_input_tokens_seen": 3248480, + "step": 3590 + }, + { + "epoch": 0.94892437640227, + "grad_norm": 0.008803827688097954, + "learning_rate": 0.2940637575763673, + "loss": 0.1164, + "num_input_tokens_seen": 3252992, + "step": 3595 + }, + { + "epoch": 0.9502441599577669, + "grad_norm": 0.006515044718980789, + "learning_rate": 0.2940473391938043, + "loss": 0.1415, + "num_input_tokens_seen": 3257632, + "step": 3600 + }, + { + "epoch": 0.9502441599577669, + "eval_loss": 0.1360039860010147, + "eval_runtime": 75.8719, + "eval_samples_per_second": 88.768, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 3257632, + "step": 3600 + }, + { + "epoch": 0.9515639435132638, + "grad_norm": 0.008967024274170399, + "learning_rate": 0.29403089859733145, + "loss": 0.1345, + "num_input_tokens_seen": 3262176, + "step": 3605 + }, + { + "epoch": 0.9528837270687607, + "grad_norm": 0.005090136080980301, + "learning_rate": 0.294014435789484, + "loss": 0.0946, + "num_input_tokens_seen": 3266464, + "step": 3610 + }, + { + "epoch": 0.9542035106242576, + "grad_norm": 0.007506897207349539, + "learning_rate": 0.2939979507728007, + "loss": 0.1441, + "num_input_tokens_seen": 3270880, + "step": 3615 + }, + { + "epoch": 0.9555232941797546, + "grad_norm": 0.0057616084814071655, + "learning_rate": 0.2939814435498239, + "loss": 0.1095, + "num_input_tokens_seen": 3275840, + "step": 3620 + }, + { + "epoch": 0.9568430777352515, + "grad_norm": 0.006551623810082674, + "learning_rate": 0.29396491412309905, + "loss": 0.1083, + "num_input_tokens_seen": 3280064, + "step": 3625 + }, + { + "epoch": 0.9581628612907483, + "grad_norm": 0.0050118756480515, + "learning_rate": 0.2939483624951753, + "loss": 0.105, + "num_input_tokens_seen": 3284352, + "step": 3630 + }, + { + "epoch": 0.9594826448462452, + "grad_norm": 0.009699489921331406, + "learning_rate": 0.2939317886686051, + "loss": 0.1227, + "num_input_tokens_seen": 3288992, + "step": 3635 + }, + { + "epoch": 0.9608024284017421, + "grad_norm": 0.005329880863428116, + "learning_rate": 0.2939151926459443, + "loss": 0.1551, + "num_input_tokens_seen": 3293760, + "step": 3640 + }, + { + "epoch": 0.962122211957239, + "grad_norm": 0.008291354402899742, + "learning_rate": 0.2938985744297522, + "loss": 0.1265, + "num_input_tokens_seen": 3298304, + "step": 3645 + }, + { + "epoch": 0.9634419955127359, + "grad_norm": 0.008008227683603764, + "learning_rate": 0.29388193402259166, + "loss": 0.1359, + "num_input_tokens_seen": 3303232, + "step": 3650 + }, + { + "epoch": 0.9647617790682328, + "grad_norm": 0.007714154198765755, + "learning_rate": 0.29386527142702873, + "loss": 0.1215, + "num_input_tokens_seen": 3307872, + "step": 3655 + }, + { + "epoch": 0.9660815626237297, + "grad_norm": 0.008206400088965893, + "learning_rate": 0.293848586645633, + "loss": 0.134, + "num_input_tokens_seen": 3312608, + "step": 3660 + }, + { + "epoch": 0.9674013461792266, + "grad_norm": 0.0069074020721018314, + "learning_rate": 0.2938318796809775, + "loss": 0.1555, + "num_input_tokens_seen": 3317152, + "step": 3665 + }, + { + "epoch": 0.9687211297347235, + "grad_norm": 0.005222322419285774, + "learning_rate": 0.29381515053563867, + "loss": 0.1275, + "num_input_tokens_seen": 3321440, + "step": 3670 + }, + { + "epoch": 0.9700409132902204, + "grad_norm": 0.008765913546085358, + "learning_rate": 0.29379839921219636, + "loss": 0.1225, + "num_input_tokens_seen": 3326080, + "step": 3675 + }, + { + "epoch": 0.9713606968457174, + "grad_norm": 0.005349337123334408, + "learning_rate": 0.2937816257132338, + "loss": 0.0936, + "num_input_tokens_seen": 3330464, + "step": 3680 + }, + { + "epoch": 0.9726804804012142, + "grad_norm": 0.0050455559976398945, + "learning_rate": 0.2937648300413376, + "loss": 0.0914, + "num_input_tokens_seen": 3334784, + "step": 3685 + }, + { + "epoch": 0.9740002639567111, + "grad_norm": 0.005151690915226936, + "learning_rate": 0.293748012199098, + "loss": 0.1111, + "num_input_tokens_seen": 3339168, + "step": 3690 + }, + { + "epoch": 0.975320047512208, + "grad_norm": 0.009630299173295498, + "learning_rate": 0.29373117218910844, + "loss": 0.122, + "num_input_tokens_seen": 3343840, + "step": 3695 + }, + { + "epoch": 0.9766398310677049, + "grad_norm": 0.00534116942435503, + "learning_rate": 0.2937143100139659, + "loss": 0.165, + "num_input_tokens_seen": 3348448, + "step": 3700 + }, + { + "epoch": 0.9779596146232018, + "grad_norm": 0.0073779914528131485, + "learning_rate": 0.29369742567627083, + "loss": 0.0933, + "num_input_tokens_seen": 3352992, + "step": 3705 + }, + { + "epoch": 0.9792793981786987, + "grad_norm": 0.00671262014657259, + "learning_rate": 0.29368051917862675, + "loss": 0.1614, + "num_input_tokens_seen": 3357504, + "step": 3710 + }, + { + "epoch": 0.9805991817341956, + "grad_norm": 0.008294402621686459, + "learning_rate": 0.2936635905236411, + "loss": 0.1618, + "num_input_tokens_seen": 3362144, + "step": 3715 + }, + { + "epoch": 0.9819189652896925, + "grad_norm": 0.003511708229780197, + "learning_rate": 0.2936466397139244, + "loss": 0.1447, + "num_input_tokens_seen": 3366464, + "step": 3720 + }, + { + "epoch": 0.9832387488451894, + "grad_norm": 0.004849019926041365, + "learning_rate": 0.2936296667520907, + "loss": 0.1554, + "num_input_tokens_seen": 3370912, + "step": 3725 + }, + { + "epoch": 0.9845585324006862, + "grad_norm": 0.007518416736274958, + "learning_rate": 0.2936126716407574, + "loss": 0.1236, + "num_input_tokens_seen": 3375264, + "step": 3730 + }, + { + "epoch": 0.9858783159561831, + "grad_norm": 0.004730489570647478, + "learning_rate": 0.29359565438254537, + "loss": 0.1019, + "num_input_tokens_seen": 3379424, + "step": 3735 + }, + { + "epoch": 0.9871980995116801, + "grad_norm": 0.008820139802992344, + "learning_rate": 0.29357861498007887, + "loss": 0.1667, + "num_input_tokens_seen": 3383840, + "step": 3740 + }, + { + "epoch": 0.988517883067177, + "grad_norm": 0.00659569725394249, + "learning_rate": 0.29356155343598567, + "loss": 0.1193, + "num_input_tokens_seen": 3388192, + "step": 3745 + }, + { + "epoch": 0.9898376666226739, + "grad_norm": 0.007531050592660904, + "learning_rate": 0.2935444697528968, + "loss": 0.1681, + "num_input_tokens_seen": 3392512, + "step": 3750 + }, + { + "epoch": 0.9911574501781708, + "grad_norm": 0.006325745023787022, + "learning_rate": 0.2935273639334468, + "loss": 0.0968, + "num_input_tokens_seen": 3397088, + "step": 3755 + }, + { + "epoch": 0.9924772337336677, + "grad_norm": 0.006096914876252413, + "learning_rate": 0.29351023598027365, + "loss": 0.1278, + "num_input_tokens_seen": 3401632, + "step": 3760 + }, + { + "epoch": 0.9937970172891646, + "grad_norm": 0.0062506734393537045, + "learning_rate": 0.2934930858960186, + "loss": 0.1357, + "num_input_tokens_seen": 3406240, + "step": 3765 + }, + { + "epoch": 0.9951168008446615, + "grad_norm": 0.0033460594713687897, + "learning_rate": 0.29347591368332643, + "loss": 0.1344, + "num_input_tokens_seen": 3410752, + "step": 3770 + }, + { + "epoch": 0.9964365844001584, + "grad_norm": 0.0068181101232767105, + "learning_rate": 0.2934587193448454, + "loss": 0.1084, + "num_input_tokens_seen": 3415040, + "step": 3775 + }, + { + "epoch": 0.9977563679556553, + "grad_norm": 0.010262629017233849, + "learning_rate": 0.29344150288322696, + "loss": 0.1821, + "num_input_tokens_seen": 3419552, + "step": 3780 + }, + { + "epoch": 0.9990761515111521, + "grad_norm": 0.007950671017169952, + "learning_rate": 0.2934242643011263, + "loss": 0.119, + "num_input_tokens_seen": 3423680, + "step": 3785 + }, + { + "epoch": 1.0002639567110994, + "grad_norm": 0.012550167739391327, + "learning_rate": 0.2934070036012016, + "loss": 0.1621, + "num_input_tokens_seen": 3427760, + "step": 3790 + }, + { + "epoch": 1.0015837402665964, + "grad_norm": 0.006963713094592094, + "learning_rate": 0.29338972078611475, + "loss": 0.1397, + "num_input_tokens_seen": 3432656, + "step": 3795 + }, + { + "epoch": 1.0029035238220931, + "grad_norm": 0.007249411195516586, + "learning_rate": 0.2933724158585311, + "loss": 0.107, + "num_input_tokens_seen": 3436976, + "step": 3800 + }, + { + "epoch": 1.0029035238220931, + "eval_loss": 0.13027693331241608, + "eval_runtime": 75.7434, + "eval_samples_per_second": 88.919, + "eval_steps_per_second": 22.233, + "num_input_tokens_seen": 3436976, + "step": 3800 + }, + { + "epoch": 1.0042233073775901, + "grad_norm": 0.006518450565636158, + "learning_rate": 0.29335508882111916, + "loss": 0.0995, + "num_input_tokens_seen": 3441488, + "step": 3805 + }, + { + "epoch": 1.005543090933087, + "grad_norm": 0.004728304222226143, + "learning_rate": 0.29333773967655097, + "loss": 0.1632, + "num_input_tokens_seen": 3446192, + "step": 3810 + }, + { + "epoch": 1.006862874488584, + "grad_norm": 0.006250473670661449, + "learning_rate": 0.2933203684275021, + "loss": 0.0979, + "num_input_tokens_seen": 3450800, + "step": 3815 + }, + { + "epoch": 1.0081826580440807, + "grad_norm": 0.008538099937140942, + "learning_rate": 0.2933029750766513, + "loss": 0.1164, + "num_input_tokens_seen": 3455184, + "step": 3820 + }, + { + "epoch": 1.0095024415995777, + "grad_norm": 0.004998205229640007, + "learning_rate": 0.2932855596266809, + "loss": 0.1449, + "num_input_tokens_seen": 3459632, + "step": 3825 + }, + { + "epoch": 1.0108222251550745, + "grad_norm": 0.00878540612757206, + "learning_rate": 0.2932681220802765, + "loss": 0.1563, + "num_input_tokens_seen": 3464208, + "step": 3830 + }, + { + "epoch": 1.0121420087105715, + "grad_norm": 0.005090650636702776, + "learning_rate": 0.2932506624401274, + "loss": 0.1069, + "num_input_tokens_seen": 3468816, + "step": 3835 + }, + { + "epoch": 1.0134617922660685, + "grad_norm": 0.008509375154972076, + "learning_rate": 0.29323318070892584, + "loss": 0.1158, + "num_input_tokens_seen": 3473584, + "step": 3840 + }, + { + "epoch": 1.0147815758215653, + "grad_norm": 0.011350620537996292, + "learning_rate": 0.29321567688936784, + "loss": 0.135, + "num_input_tokens_seen": 3478096, + "step": 3845 + }, + { + "epoch": 1.0161013593770623, + "grad_norm": 0.009013637900352478, + "learning_rate": 0.29319815098415275, + "loss": 0.1535, + "num_input_tokens_seen": 3482352, + "step": 3850 + }, + { + "epoch": 1.017421142932559, + "grad_norm": 0.006964719854295254, + "learning_rate": 0.2931806029959832, + "loss": 0.1346, + "num_input_tokens_seen": 3487024, + "step": 3855 + }, + { + "epoch": 1.018740926488056, + "grad_norm": 0.004754696507006884, + "learning_rate": 0.29316303292756535, + "loss": 0.1307, + "num_input_tokens_seen": 3491312, + "step": 3860 + }, + { + "epoch": 1.0200607100435528, + "grad_norm": 0.009167353622615337, + "learning_rate": 0.29314544078160876, + "loss": 0.118, + "num_input_tokens_seen": 3496240, + "step": 3865 + }, + { + "epoch": 1.0213804935990498, + "grad_norm": 0.007419477682560682, + "learning_rate": 0.2931278265608263, + "loss": 0.1505, + "num_input_tokens_seen": 3500816, + "step": 3870 + }, + { + "epoch": 1.0227002771545466, + "grad_norm": 0.0073678698390722275, + "learning_rate": 0.29311019026793433, + "loss": 0.1312, + "num_input_tokens_seen": 3505264, + "step": 3875 + }, + { + "epoch": 1.0240200607100436, + "grad_norm": 0.005273912567645311, + "learning_rate": 0.29309253190565254, + "loss": 0.1273, + "num_input_tokens_seen": 3509904, + "step": 3880 + }, + { + "epoch": 1.0253398442655404, + "grad_norm": 0.009371760301291943, + "learning_rate": 0.2930748514767042, + "loss": 0.0894, + "num_input_tokens_seen": 3514096, + "step": 3885 + }, + { + "epoch": 1.0266596278210374, + "grad_norm": 0.010474758222699165, + "learning_rate": 0.29305714898381574, + "loss": 0.0926, + "num_input_tokens_seen": 3518736, + "step": 3890 + }, + { + "epoch": 1.0279794113765341, + "grad_norm": 0.0037275587674230337, + "learning_rate": 0.29303942442971714, + "loss": 0.0567, + "num_input_tokens_seen": 3523152, + "step": 3895 + }, + { + "epoch": 1.0292991949320311, + "grad_norm": 0.009755988605320454, + "learning_rate": 0.2930216778171417, + "loss": 0.0948, + "num_input_tokens_seen": 3527600, + "step": 3900 + }, + { + "epoch": 1.0306189784875281, + "grad_norm": 0.010622287169098854, + "learning_rate": 0.2930039091488263, + "loss": 0.1193, + "num_input_tokens_seen": 3532272, + "step": 3905 + }, + { + "epoch": 1.031938762043025, + "grad_norm": 0.010489331558346748, + "learning_rate": 0.29298611842751093, + "loss": 0.1201, + "num_input_tokens_seen": 3536848, + "step": 3910 + }, + { + "epoch": 1.033258545598522, + "grad_norm": 0.008544831536710262, + "learning_rate": 0.29296830565593923, + "loss": 0.1307, + "num_input_tokens_seen": 3541552, + "step": 3915 + }, + { + "epoch": 1.0345783291540187, + "grad_norm": 0.015241253189742565, + "learning_rate": 0.2929504708368582, + "loss": 0.1738, + "num_input_tokens_seen": 3546032, + "step": 3920 + }, + { + "epoch": 1.0358981127095157, + "grad_norm": 0.005570739507675171, + "learning_rate": 0.29293261397301806, + "loss": 0.1615, + "num_input_tokens_seen": 3550512, + "step": 3925 + }, + { + "epoch": 1.0372178962650125, + "grad_norm": 0.006234465166926384, + "learning_rate": 0.29291473506717275, + "loss": 0.1389, + "num_input_tokens_seen": 3554768, + "step": 3930 + }, + { + "epoch": 1.0385376798205095, + "grad_norm": 0.008238298818469048, + "learning_rate": 0.29289683412207923, + "loss": 0.1376, + "num_input_tokens_seen": 3559536, + "step": 3935 + }, + { + "epoch": 1.0398574633760063, + "grad_norm": 0.00523471599444747, + "learning_rate": 0.29287891114049813, + "loss": 0.1103, + "num_input_tokens_seen": 3564016, + "step": 3940 + }, + { + "epoch": 1.0411772469315033, + "grad_norm": 0.006178983487188816, + "learning_rate": 0.29286096612519347, + "loss": 0.1401, + "num_input_tokens_seen": 3568720, + "step": 3945 + }, + { + "epoch": 1.042497030487, + "grad_norm": 0.006910347379744053, + "learning_rate": 0.2928429990789325, + "loss": 0.139, + "num_input_tokens_seen": 3573488, + "step": 3950 + }, + { + "epoch": 1.043816814042497, + "grad_norm": 0.00642067426815629, + "learning_rate": 0.29282501000448596, + "loss": 0.1253, + "num_input_tokens_seen": 3577968, + "step": 3955 + }, + { + "epoch": 1.0451365975979938, + "grad_norm": 0.005196591839194298, + "learning_rate": 0.2928069989046281, + "loss": 0.1068, + "num_input_tokens_seen": 3582320, + "step": 3960 + }, + { + "epoch": 1.0464563811534908, + "grad_norm": 0.004453133791685104, + "learning_rate": 0.2927889657821363, + "loss": 0.1084, + "num_input_tokens_seen": 3586448, + "step": 3965 + }, + { + "epoch": 1.0477761647089878, + "grad_norm": 0.00672508729621768, + "learning_rate": 0.2927709106397916, + "loss": 0.1007, + "num_input_tokens_seen": 3590896, + "step": 3970 + }, + { + "epoch": 1.0490959482644846, + "grad_norm": 0.00684950640425086, + "learning_rate": 0.29275283348037834, + "loss": 0.1325, + "num_input_tokens_seen": 3595824, + "step": 3975 + }, + { + "epoch": 1.0504157318199816, + "grad_norm": 0.005598016083240509, + "learning_rate": 0.29273473430668423, + "loss": 0.1143, + "num_input_tokens_seen": 3600112, + "step": 3980 + }, + { + "epoch": 1.0517355153754784, + "grad_norm": 0.00668564485386014, + "learning_rate": 0.2927166131215003, + "loss": 0.1628, + "num_input_tokens_seen": 3604816, + "step": 3985 + }, + { + "epoch": 1.0530552989309754, + "grad_norm": 0.013315638527274132, + "learning_rate": 0.2926984699276212, + "loss": 0.1433, + "num_input_tokens_seen": 3609520, + "step": 3990 + }, + { + "epoch": 1.0543750824864722, + "grad_norm": 0.0068726069293916225, + "learning_rate": 0.29268030472784473, + "loss": 0.1355, + "num_input_tokens_seen": 3613776, + "step": 3995 + }, + { + "epoch": 1.0556948660419692, + "grad_norm": 0.0063176462426781654, + "learning_rate": 0.2926621175249723, + "loss": 0.1665, + "num_input_tokens_seen": 3618672, + "step": 4000 + }, + { + "epoch": 1.0556948660419692, + "eval_loss": 0.14648379385471344, + "eval_runtime": 75.7964, + "eval_samples_per_second": 88.857, + "eval_steps_per_second": 22.217, + "num_input_tokens_seen": 3618672, + "step": 4000 + }, + { + "epoch": 1.057014649597466, + "grad_norm": 0.008738157339394093, + "learning_rate": 0.29264390832180853, + "loss": 0.1209, + "num_input_tokens_seen": 3623312, + "step": 4005 + }, + { + "epoch": 1.058334433152963, + "grad_norm": 0.008272713981568813, + "learning_rate": 0.29262567712116144, + "loss": 0.1538, + "num_input_tokens_seen": 3627856, + "step": 4010 + }, + { + "epoch": 1.0596542167084597, + "grad_norm": 0.012083081528544426, + "learning_rate": 0.29260742392584266, + "loss": 0.1465, + "num_input_tokens_seen": 3632368, + "step": 4015 + }, + { + "epoch": 1.0609740002639567, + "grad_norm": 0.005341961048543453, + "learning_rate": 0.292589148738667, + "loss": 0.1009, + "num_input_tokens_seen": 3636784, + "step": 4020 + }, + { + "epoch": 1.0622937838194537, + "grad_norm": 0.00660498533397913, + "learning_rate": 0.2925708515624527, + "loss": 0.1577, + "num_input_tokens_seen": 3641296, + "step": 4025 + }, + { + "epoch": 1.0636135673749505, + "grad_norm": 0.006869509816169739, + "learning_rate": 0.29255253240002144, + "loss": 0.1402, + "num_input_tokens_seen": 3646000, + "step": 4030 + }, + { + "epoch": 1.0649333509304475, + "grad_norm": 0.005050722975283861, + "learning_rate": 0.2925341912541983, + "loss": 0.1315, + "num_input_tokens_seen": 3650416, + "step": 4035 + }, + { + "epoch": 1.0662531344859443, + "grad_norm": 0.006346312817186117, + "learning_rate": 0.2925158281278116, + "loss": 0.1312, + "num_input_tokens_seen": 3655056, + "step": 4040 + }, + { + "epoch": 1.0675729180414413, + "grad_norm": 0.0060324156656861305, + "learning_rate": 0.29249744302369324, + "loss": 0.1538, + "num_input_tokens_seen": 3659504, + "step": 4045 + }, + { + "epoch": 1.068892701596938, + "grad_norm": 0.009989278391003609, + "learning_rate": 0.29247903594467844, + "loss": 0.1464, + "num_input_tokens_seen": 3663984, + "step": 4050 + }, + { + "epoch": 1.070212485152435, + "grad_norm": 0.005026288330554962, + "learning_rate": 0.2924606068936058, + "loss": 0.1224, + "num_input_tokens_seen": 3668400, + "step": 4055 + }, + { + "epoch": 1.0715322687079318, + "grad_norm": 0.008097591809928417, + "learning_rate": 0.2924421558733173, + "loss": 0.1525, + "num_input_tokens_seen": 3672912, + "step": 4060 + }, + { + "epoch": 1.0728520522634288, + "grad_norm": 0.006842585746198893, + "learning_rate": 0.2924236828866583, + "loss": 0.0948, + "num_input_tokens_seen": 3677680, + "step": 4065 + }, + { + "epoch": 1.0741718358189256, + "grad_norm": 0.00540498923510313, + "learning_rate": 0.29240518793647763, + "loss": 0.1465, + "num_input_tokens_seen": 3682224, + "step": 4070 + }, + { + "epoch": 1.0754916193744226, + "grad_norm": 0.008019238710403442, + "learning_rate": 0.29238667102562743, + "loss": 0.1419, + "num_input_tokens_seen": 3686672, + "step": 4075 + }, + { + "epoch": 1.0768114029299194, + "grad_norm": 0.007614405360072851, + "learning_rate": 0.29236813215696317, + "loss": 0.1635, + "num_input_tokens_seen": 3691280, + "step": 4080 + }, + { + "epoch": 1.0781311864854164, + "grad_norm": 0.005668691359460354, + "learning_rate": 0.2923495713333439, + "loss": 0.1518, + "num_input_tokens_seen": 3695920, + "step": 4085 + }, + { + "epoch": 1.0794509700409134, + "grad_norm": 0.005253260489553213, + "learning_rate": 0.29233098855763173, + "loss": 0.1175, + "num_input_tokens_seen": 3700496, + "step": 4090 + }, + { + "epoch": 1.0807707535964102, + "grad_norm": 0.007714542560279369, + "learning_rate": 0.29231238383269254, + "loss": 0.1049, + "num_input_tokens_seen": 3705200, + "step": 4095 + }, + { + "epoch": 1.0820905371519072, + "grad_norm": 0.006931937765330076, + "learning_rate": 0.2922937571613954, + "loss": 0.0983, + "num_input_tokens_seen": 3709776, + "step": 4100 + }, + { + "epoch": 1.083410320707404, + "grad_norm": 0.007913459092378616, + "learning_rate": 0.29227510854661265, + "loss": 0.1215, + "num_input_tokens_seen": 3714544, + "step": 4105 + }, + { + "epoch": 1.084730104262901, + "grad_norm": 0.0188645850867033, + "learning_rate": 0.29225643799122025, + "loss": 0.1612, + "num_input_tokens_seen": 3718832, + "step": 4110 + }, + { + "epoch": 1.0860498878183977, + "grad_norm": 0.0071865045465528965, + "learning_rate": 0.2922377454980974, + "loss": 0.1389, + "num_input_tokens_seen": 3723184, + "step": 4115 + }, + { + "epoch": 1.0873696713738947, + "grad_norm": 0.007775878068059683, + "learning_rate": 0.29221903107012676, + "loss": 0.1585, + "num_input_tokens_seen": 3727792, + "step": 4120 + }, + { + "epoch": 1.0886894549293915, + "grad_norm": 0.004502252209931612, + "learning_rate": 0.29220029471019426, + "loss": 0.1051, + "num_input_tokens_seen": 3732272, + "step": 4125 + }, + { + "epoch": 1.0900092384848885, + "grad_norm": 0.003894754219800234, + "learning_rate": 0.2921815364211893, + "loss": 0.1265, + "num_input_tokens_seen": 3736848, + "step": 4130 + }, + { + "epoch": 1.0913290220403853, + "grad_norm": 0.007309382315725088, + "learning_rate": 0.29216275620600474, + "loss": 0.1177, + "num_input_tokens_seen": 3741488, + "step": 4135 + }, + { + "epoch": 1.0926488055958823, + "grad_norm": 0.003994531463831663, + "learning_rate": 0.29214395406753657, + "loss": 0.1191, + "num_input_tokens_seen": 3746256, + "step": 4140 + }, + { + "epoch": 1.093968589151379, + "grad_norm": 0.004549433942884207, + "learning_rate": 0.2921251300086844, + "loss": 0.1069, + "num_input_tokens_seen": 3750928, + "step": 4145 + }, + { + "epoch": 1.095288372706876, + "grad_norm": 0.006482400000095367, + "learning_rate": 0.2921062840323511, + "loss": 0.1212, + "num_input_tokens_seen": 3755728, + "step": 4150 + }, + { + "epoch": 1.096608156262373, + "grad_norm": 0.010825647972524166, + "learning_rate": 0.29208741614144307, + "loss": 0.1225, + "num_input_tokens_seen": 3760304, + "step": 4155 + }, + { + "epoch": 1.0979279398178698, + "grad_norm": 0.007442600559443235, + "learning_rate": 0.2920685263388698, + "loss": 0.152, + "num_input_tokens_seen": 3764848, + "step": 4160 + }, + { + "epoch": 1.0992477233733668, + "grad_norm": 0.013787897303700447, + "learning_rate": 0.2920496146275445, + "loss": 0.1668, + "num_input_tokens_seen": 3768912, + "step": 4165 + }, + { + "epoch": 1.1005675069288636, + "grad_norm": 0.003761577419936657, + "learning_rate": 0.29203068101038343, + "loss": 0.1078, + "num_input_tokens_seen": 3773520, + "step": 4170 + }, + { + "epoch": 1.1018872904843606, + "grad_norm": 0.004268100950866938, + "learning_rate": 0.2920117254903065, + "loss": 0.1362, + "num_input_tokens_seen": 3778256, + "step": 4175 + }, + { + "epoch": 1.1032070740398574, + "grad_norm": 0.009740371257066727, + "learning_rate": 0.29199274807023695, + "loss": 0.1492, + "num_input_tokens_seen": 3782480, + "step": 4180 + }, + { + "epoch": 1.1045268575953544, + "grad_norm": 0.0060446858406066895, + "learning_rate": 0.29197374875310117, + "loss": 0.1377, + "num_input_tokens_seen": 3787088, + "step": 4185 + }, + { + "epoch": 1.1058466411508512, + "grad_norm": 0.005509675946086645, + "learning_rate": 0.2919547275418292, + "loss": 0.1137, + "num_input_tokens_seen": 3791600, + "step": 4190 + }, + { + "epoch": 1.1071664247063482, + "grad_norm": 0.007335728034377098, + "learning_rate": 0.29193568443935436, + "loss": 0.1096, + "num_input_tokens_seen": 3796368, + "step": 4195 + }, + { + "epoch": 1.108486208261845, + "grad_norm": 0.012188509106636047, + "learning_rate": 0.2919166194486133, + "loss": 0.178, + "num_input_tokens_seen": 3800592, + "step": 4200 + }, + { + "epoch": 1.108486208261845, + "eval_loss": 0.14308056235313416, + "eval_runtime": 75.9392, + "eval_samples_per_second": 88.689, + "eval_steps_per_second": 22.176, + "num_input_tokens_seen": 3800592, + "step": 4200 + }, + { + "epoch": 1.109805991817342, + "grad_norm": 0.004779187496751547, + "learning_rate": 0.2918975325725461, + "loss": 0.1156, + "num_input_tokens_seen": 3805008, + "step": 4205 + }, + { + "epoch": 1.1111257753728387, + "grad_norm": 0.005727575160562992, + "learning_rate": 0.29187842381409607, + "loss": 0.0908, + "num_input_tokens_seen": 3809712, + "step": 4210 + }, + { + "epoch": 1.1124455589283357, + "grad_norm": 0.009749376215040684, + "learning_rate": 0.29185929317621023, + "loss": 0.1176, + "num_input_tokens_seen": 3814032, + "step": 4215 + }, + { + "epoch": 1.1137653424838327, + "grad_norm": 0.009927963837981224, + "learning_rate": 0.29184014066183867, + "loss": 0.1277, + "num_input_tokens_seen": 3818320, + "step": 4220 + }, + { + "epoch": 1.1150851260393295, + "grad_norm": 0.006524521391838789, + "learning_rate": 0.2918209662739349, + "loss": 0.1313, + "num_input_tokens_seen": 3823120, + "step": 4225 + }, + { + "epoch": 1.1164049095948265, + "grad_norm": 0.0033101036678999662, + "learning_rate": 0.29180177001545593, + "loss": 0.1191, + "num_input_tokens_seen": 3827664, + "step": 4230 + }, + { + "epoch": 1.1177246931503233, + "grad_norm": 0.008306062780320644, + "learning_rate": 0.29178255188936203, + "loss": 0.1529, + "num_input_tokens_seen": 3831888, + "step": 4235 + }, + { + "epoch": 1.1190444767058203, + "grad_norm": 0.0051909261383116245, + "learning_rate": 0.2917633118986169, + "loss": 0.1182, + "num_input_tokens_seen": 3836272, + "step": 4240 + }, + { + "epoch": 1.120364260261317, + "grad_norm": 0.008655146695673466, + "learning_rate": 0.2917440500461875, + "loss": 0.1619, + "num_input_tokens_seen": 3840752, + "step": 4245 + }, + { + "epoch": 1.121684043816814, + "grad_norm": 0.0026306286454200745, + "learning_rate": 0.29172476633504435, + "loss": 0.1085, + "num_input_tokens_seen": 3845072, + "step": 4250 + }, + { + "epoch": 1.1230038273723109, + "grad_norm": 0.006750341504812241, + "learning_rate": 0.2917054607681612, + "loss": 0.1329, + "num_input_tokens_seen": 3849552, + "step": 4255 + }, + { + "epoch": 1.1243236109278079, + "grad_norm": 0.0061124349012970924, + "learning_rate": 0.29168613334851523, + "loss": 0.1497, + "num_input_tokens_seen": 3854320, + "step": 4260 + }, + { + "epoch": 1.1256433944833049, + "grad_norm": 0.0038168751634657383, + "learning_rate": 0.2916667840790869, + "loss": 0.0873, + "num_input_tokens_seen": 3858800, + "step": 4265 + }, + { + "epoch": 1.1269631780388016, + "grad_norm": 0.005847475025802851, + "learning_rate": 0.2916474129628603, + "loss": 0.1075, + "num_input_tokens_seen": 3863440, + "step": 4270 + }, + { + "epoch": 1.1282829615942984, + "grad_norm": 0.013407524675130844, + "learning_rate": 0.29162802000282245, + "loss": 0.1264, + "num_input_tokens_seen": 3867952, + "step": 4275 + }, + { + "epoch": 1.1296027451497954, + "grad_norm": 0.006543317344039679, + "learning_rate": 0.2916086052019642, + "loss": 0.1226, + "num_input_tokens_seen": 3872528, + "step": 4280 + }, + { + "epoch": 1.1309225287052924, + "grad_norm": 0.005400500725954771, + "learning_rate": 0.2915891685632794, + "loss": 0.1699, + "num_input_tokens_seen": 3876880, + "step": 4285 + }, + { + "epoch": 1.1322423122607892, + "grad_norm": 0.007880859076976776, + "learning_rate": 0.29156971008976545, + "loss": 0.1395, + "num_input_tokens_seen": 3881520, + "step": 4290 + }, + { + "epoch": 1.1335620958162862, + "grad_norm": 0.007654520217329264, + "learning_rate": 0.2915502297844232, + "loss": 0.1451, + "num_input_tokens_seen": 3885936, + "step": 4295 + }, + { + "epoch": 1.134881879371783, + "grad_norm": 0.008880753070116043, + "learning_rate": 0.2915307276502566, + "loss": 0.1315, + "num_input_tokens_seen": 3890832, + "step": 4300 + }, + { + "epoch": 1.13620166292728, + "grad_norm": 0.004152935463935137, + "learning_rate": 0.29151120369027334, + "loss": 0.1083, + "num_input_tokens_seen": 3895408, + "step": 4305 + }, + { + "epoch": 1.1375214464827768, + "grad_norm": 0.007034148555248976, + "learning_rate": 0.29149165790748405, + "loss": 0.1302, + "num_input_tokens_seen": 3899728, + "step": 4310 + }, + { + "epoch": 1.1388412300382738, + "grad_norm": 0.007364292629063129, + "learning_rate": 0.291472090304903, + "loss": 0.1234, + "num_input_tokens_seen": 3904272, + "step": 4315 + }, + { + "epoch": 1.1401610135937705, + "grad_norm": 0.009354576468467712, + "learning_rate": 0.2914525008855478, + "loss": 0.0984, + "num_input_tokens_seen": 3908720, + "step": 4320 + }, + { + "epoch": 1.1414807971492675, + "grad_norm": 0.010651948861777782, + "learning_rate": 0.2914328896524394, + "loss": 0.0923, + "num_input_tokens_seen": 3913200, + "step": 4325 + }, + { + "epoch": 1.1428005807047645, + "grad_norm": 0.0058954013511538506, + "learning_rate": 0.291413256608602, + "loss": 0.1016, + "num_input_tokens_seen": 3917616, + "step": 4330 + }, + { + "epoch": 1.1441203642602613, + "grad_norm": 0.0059235249646008015, + "learning_rate": 0.29139360175706336, + "loss": 0.0778, + "num_input_tokens_seen": 3922000, + "step": 4335 + }, + { + "epoch": 1.145440147815758, + "grad_norm": 0.0057116420939564705, + "learning_rate": 0.2913739251008544, + "loss": 0.1355, + "num_input_tokens_seen": 3926288, + "step": 4340 + }, + { + "epoch": 1.146759931371255, + "grad_norm": 0.007830889895558357, + "learning_rate": 0.29135422664300964, + "loss": 0.0874, + "num_input_tokens_seen": 3930704, + "step": 4345 + }, + { + "epoch": 1.148079714926752, + "grad_norm": 0.004470753017812967, + "learning_rate": 0.29133450638656677, + "loss": 0.1282, + "num_input_tokens_seen": 3935376, + "step": 4350 + }, + { + "epoch": 1.1493994984822489, + "grad_norm": 0.0052034808322787285, + "learning_rate": 0.2913147643345669, + "loss": 0.1876, + "num_input_tokens_seen": 3939728, + "step": 4355 + }, + { + "epoch": 1.1507192820377459, + "grad_norm": 0.009258999489247799, + "learning_rate": 0.29129500049005447, + "loss": 0.148, + "num_input_tokens_seen": 3944528, + "step": 4360 + }, + { + "epoch": 1.1520390655932427, + "grad_norm": 0.006445402279496193, + "learning_rate": 0.2912752148560773, + "loss": 0.141, + "num_input_tokens_seen": 3949040, + "step": 4365 + }, + { + "epoch": 1.1533588491487397, + "grad_norm": 0.005174792371690273, + "learning_rate": 0.2912554074356866, + "loss": 0.1202, + "num_input_tokens_seen": 3953584, + "step": 4370 + }, + { + "epoch": 1.1546786327042364, + "grad_norm": 0.008898233994841576, + "learning_rate": 0.2912355782319371, + "loss": 0.1214, + "num_input_tokens_seen": 3958096, + "step": 4375 + }, + { + "epoch": 1.1559984162597334, + "grad_norm": 0.006105615757405758, + "learning_rate": 0.2912157272478864, + "loss": 0.1402, + "num_input_tokens_seen": 3962480, + "step": 4380 + }, + { + "epoch": 1.1573181998152302, + "grad_norm": 0.01053818129003048, + "learning_rate": 0.291195854486596, + "loss": 0.1285, + "num_input_tokens_seen": 3967024, + "step": 4385 + }, + { + "epoch": 1.1586379833707272, + "grad_norm": 0.004836898297071457, + "learning_rate": 0.2911759599511305, + "loss": 0.109, + "num_input_tokens_seen": 3971824, + "step": 4390 + }, + { + "epoch": 1.1599577669262242, + "grad_norm": 0.013931578025221825, + "learning_rate": 0.29115604364455777, + "loss": 0.1579, + "num_input_tokens_seen": 3976240, + "step": 4395 + }, + { + "epoch": 1.161277550481721, + "grad_norm": 0.009723753668367863, + "learning_rate": 0.2911361055699493, + "loss": 0.1298, + "num_input_tokens_seen": 3980592, + "step": 4400 + }, + { + "epoch": 1.161277550481721, + "eval_loss": 0.15018178522586823, + "eval_runtime": 75.8869, + "eval_samples_per_second": 88.75, + "eval_steps_per_second": 22.191, + "num_input_tokens_seen": 3980592, + "step": 4400 + }, + { + "epoch": 1.162597334037218, + "grad_norm": 0.007469015195965767, + "learning_rate": 0.2911161457303797, + "loss": 0.1358, + "num_input_tokens_seen": 3984848, + "step": 4405 + }, + { + "epoch": 1.1639171175927148, + "grad_norm": 0.00684467563405633, + "learning_rate": 0.291096164128927, + "loss": 0.1335, + "num_input_tokens_seen": 3989232, + "step": 4410 + }, + { + "epoch": 1.1652369011482118, + "grad_norm": 0.011707705445587635, + "learning_rate": 0.2910761607686727, + "loss": 0.1788, + "num_input_tokens_seen": 3993648, + "step": 4415 + }, + { + "epoch": 1.1665566847037085, + "grad_norm": 0.0038322831969708204, + "learning_rate": 0.2910561356527016, + "loss": 0.1134, + "num_input_tokens_seen": 3998160, + "step": 4420 + }, + { + "epoch": 1.1678764682592055, + "grad_norm": 0.007062537129968405, + "learning_rate": 0.2910360887841017, + "loss": 0.1142, + "num_input_tokens_seen": 4002704, + "step": 4425 + }, + { + "epoch": 1.1691962518147023, + "grad_norm": 0.0030066086910665035, + "learning_rate": 0.2910160201659645, + "loss": 0.089, + "num_input_tokens_seen": 4007120, + "step": 4430 + }, + { + "epoch": 1.1705160353701993, + "grad_norm": 0.005000459961593151, + "learning_rate": 0.29099592980138494, + "loss": 0.1037, + "num_input_tokens_seen": 4011312, + "step": 4435 + }, + { + "epoch": 1.171835818925696, + "grad_norm": 0.008020681329071522, + "learning_rate": 0.29097581769346115, + "loss": 0.133, + "num_input_tokens_seen": 4015952, + "step": 4440 + }, + { + "epoch": 1.173155602481193, + "grad_norm": 0.011314941570162773, + "learning_rate": 0.29095568384529463, + "loss": 0.107, + "num_input_tokens_seen": 4020784, + "step": 4445 + }, + { + "epoch": 1.1744753860366899, + "grad_norm": 0.006983778905123472, + "learning_rate": 0.2909355282599903, + "loss": 0.123, + "num_input_tokens_seen": 4025264, + "step": 4450 + }, + { + "epoch": 1.1757951695921869, + "grad_norm": 0.011290546506643295, + "learning_rate": 0.29091535094065635, + "loss": 0.1274, + "num_input_tokens_seen": 4029872, + "step": 4455 + }, + { + "epoch": 1.1771149531476839, + "grad_norm": 0.007481167558580637, + "learning_rate": 0.2908951518904045, + "loss": 0.1417, + "num_input_tokens_seen": 4034384, + "step": 4460 + }, + { + "epoch": 1.1784347367031807, + "grad_norm": 0.005002445541322231, + "learning_rate": 0.29087493111234963, + "loss": 0.1464, + "num_input_tokens_seen": 4038608, + "step": 4465 + }, + { + "epoch": 1.1797545202586777, + "grad_norm": 0.007125604432076216, + "learning_rate": 0.29085468860961, + "loss": 0.1578, + "num_input_tokens_seen": 4043120, + "step": 4470 + }, + { + "epoch": 1.1810743038141744, + "grad_norm": 0.006201822776347399, + "learning_rate": 0.2908344243853073, + "loss": 0.105, + "num_input_tokens_seen": 4047824, + "step": 4475 + }, + { + "epoch": 1.1823940873696714, + "grad_norm": 0.008466248400509357, + "learning_rate": 0.2908141384425666, + "loss": 0.1267, + "num_input_tokens_seen": 4052336, + "step": 4480 + }, + { + "epoch": 1.1837138709251682, + "grad_norm": 0.009125655516982079, + "learning_rate": 0.2907938307845161, + "loss": 0.1344, + "num_input_tokens_seen": 4057040, + "step": 4485 + }, + { + "epoch": 1.1850336544806652, + "grad_norm": 0.009750205092132092, + "learning_rate": 0.2907735014142876, + "loss": 0.1451, + "num_input_tokens_seen": 4061456, + "step": 4490 + }, + { + "epoch": 1.186353438036162, + "grad_norm": 0.008283338509500027, + "learning_rate": 0.2907531503350161, + "loss": 0.1233, + "num_input_tokens_seen": 4065776, + "step": 4495 + }, + { + "epoch": 1.187673221591659, + "grad_norm": 0.0054720500484108925, + "learning_rate": 0.29073277754983995, + "loss": 0.1631, + "num_input_tokens_seen": 4070704, + "step": 4500 + }, + { + "epoch": 1.1889930051471558, + "grad_norm": 0.004313196986913681, + "learning_rate": 0.290712383061901, + "loss": 0.1358, + "num_input_tokens_seen": 4075216, + "step": 4505 + }, + { + "epoch": 1.1903127887026528, + "grad_norm": 0.005440705921500921, + "learning_rate": 0.2906919668743443, + "loss": 0.1576, + "num_input_tokens_seen": 4079696, + "step": 4510 + }, + { + "epoch": 1.1916325722581496, + "grad_norm": 0.005633278749883175, + "learning_rate": 0.29067152899031823, + "loss": 0.0993, + "num_input_tokens_seen": 4084240, + "step": 4515 + }, + { + "epoch": 1.1929523558136466, + "grad_norm": 0.003586522303521633, + "learning_rate": 0.2906510694129746, + "loss": 0.1142, + "num_input_tokens_seen": 4088624, + "step": 4520 + }, + { + "epoch": 1.1942721393691436, + "grad_norm": 0.013463805429637432, + "learning_rate": 0.2906305881454685, + "loss": 0.1387, + "num_input_tokens_seen": 4092592, + "step": 4525 + }, + { + "epoch": 1.1955919229246403, + "grad_norm": 0.006284086499363184, + "learning_rate": 0.2906100851909585, + "loss": 0.1207, + "num_input_tokens_seen": 4097680, + "step": 4530 + }, + { + "epoch": 1.1969117064801373, + "grad_norm": 0.004781382158398628, + "learning_rate": 0.29058956055260626, + "loss": 0.1182, + "num_input_tokens_seen": 4102480, + "step": 4535 + }, + { + "epoch": 1.1982314900356341, + "grad_norm": 0.005833441391587257, + "learning_rate": 0.2905690142335771, + "loss": 0.1273, + "num_input_tokens_seen": 4106864, + "step": 4540 + }, + { + "epoch": 1.1995512735911311, + "grad_norm": 0.006971859373152256, + "learning_rate": 0.29054844623703946, + "loss": 0.1157, + "num_input_tokens_seen": 4111504, + "step": 4545 + }, + { + "epoch": 1.200871057146628, + "grad_norm": 0.0030139326117932796, + "learning_rate": 0.2905278565661651, + "loss": 0.1274, + "num_input_tokens_seen": 4116208, + "step": 4550 + }, + { + "epoch": 1.202190840702125, + "grad_norm": 0.008300424553453922, + "learning_rate": 0.2905072452241293, + "loss": 0.1568, + "num_input_tokens_seen": 4120848, + "step": 4555 + }, + { + "epoch": 1.2035106242576217, + "grad_norm": 0.007644175551831722, + "learning_rate": 0.2904866122141106, + "loss": 0.1443, + "num_input_tokens_seen": 4125360, + "step": 4560 + }, + { + "epoch": 1.2048304078131187, + "grad_norm": 0.01061201561242342, + "learning_rate": 0.2904659575392908, + "loss": 0.1279, + "num_input_tokens_seen": 4129680, + "step": 4565 + }, + { + "epoch": 1.2061501913686155, + "grad_norm": 0.007456521037966013, + "learning_rate": 0.2904452812028551, + "loss": 0.1273, + "num_input_tokens_seen": 4134256, + "step": 4570 + }, + { + "epoch": 1.2074699749241125, + "grad_norm": 0.009045533835887909, + "learning_rate": 0.2904245832079922, + "loss": 0.1675, + "num_input_tokens_seen": 4139312, + "step": 4575 + }, + { + "epoch": 1.2087897584796092, + "grad_norm": 0.005348899867385626, + "learning_rate": 0.29040386355789377, + "loss": 0.1189, + "num_input_tokens_seen": 4144080, + "step": 4580 + }, + { + "epoch": 1.2101095420351062, + "grad_norm": 0.00588280288502574, + "learning_rate": 0.29038312225575524, + "loss": 0.1408, + "num_input_tokens_seen": 4148592, + "step": 4585 + }, + { + "epoch": 1.2114293255906032, + "grad_norm": 0.004976367112249136, + "learning_rate": 0.29036235930477505, + "loss": 0.1027, + "num_input_tokens_seen": 4153008, + "step": 4590 + }, + { + "epoch": 1.2127491091461, + "grad_norm": 0.00499043008312583, + "learning_rate": 0.29034157470815514, + "loss": 0.1098, + "num_input_tokens_seen": 4157840, + "step": 4595 + }, + { + "epoch": 1.214068892701597, + "grad_norm": 0.00825877021998167, + "learning_rate": 0.2903207684691008, + "loss": 0.1262, + "num_input_tokens_seen": 4161936, + "step": 4600 + }, + { + "epoch": 1.214068892701597, + "eval_loss": 0.15943706035614014, + "eval_runtime": 75.8723, + "eval_samples_per_second": 88.768, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 4161936, + "step": 4600 + }, + { + "epoch": 1.2153886762570938, + "grad_norm": 0.0063225021585822105, + "learning_rate": 0.29029994059082054, + "loss": 0.1284, + "num_input_tokens_seen": 4166160, + "step": 4605 + }, + { + "epoch": 1.2167084598125908, + "grad_norm": 0.005541889928281307, + "learning_rate": 0.2902790910765264, + "loss": 0.1412, + "num_input_tokens_seen": 4170768, + "step": 4610 + }, + { + "epoch": 1.2180282433680876, + "grad_norm": 0.005383352283388376, + "learning_rate": 0.29025821992943346, + "loss": 0.1238, + "num_input_tokens_seen": 4175344, + "step": 4615 + }, + { + "epoch": 1.2193480269235846, + "grad_norm": 0.007081274874508381, + "learning_rate": 0.29023732715276046, + "loss": 0.1693, + "num_input_tokens_seen": 4179824, + "step": 4620 + }, + { + "epoch": 1.2206678104790813, + "grad_norm": 0.006802636198699474, + "learning_rate": 0.2902164127497293, + "loss": 0.1358, + "num_input_tokens_seen": 4183920, + "step": 4625 + }, + { + "epoch": 1.2219875940345784, + "grad_norm": 0.0054868292063474655, + "learning_rate": 0.2901954767235652, + "loss": 0.1148, + "num_input_tokens_seen": 4188624, + "step": 4630 + }, + { + "epoch": 1.2233073775900754, + "grad_norm": 0.004916409496217966, + "learning_rate": 0.2901745190774968, + "loss": 0.1441, + "num_input_tokens_seen": 4193136, + "step": 4635 + }, + { + "epoch": 1.2246271611455721, + "grad_norm": 0.00464195990934968, + "learning_rate": 0.290153539814756, + "loss": 0.1171, + "num_input_tokens_seen": 4197648, + "step": 4640 + }, + { + "epoch": 1.225946944701069, + "grad_norm": 0.007062013726681471, + "learning_rate": 0.2901325389385781, + "loss": 0.0968, + "num_input_tokens_seen": 4202224, + "step": 4645 + }, + { + "epoch": 1.227266728256566, + "grad_norm": 0.004339401610195637, + "learning_rate": 0.2901115164522016, + "loss": 0.1027, + "num_input_tokens_seen": 4206928, + "step": 4650 + }, + { + "epoch": 1.228586511812063, + "grad_norm": 0.004561142064630985, + "learning_rate": 0.29009047235886865, + "loss": 0.0879, + "num_input_tokens_seen": 4211216, + "step": 4655 + }, + { + "epoch": 1.2299062953675597, + "grad_norm": 0.005037636961787939, + "learning_rate": 0.2900694066618243, + "loss": 0.1183, + "num_input_tokens_seen": 4215440, + "step": 4660 + }, + { + "epoch": 1.2312260789230567, + "grad_norm": 0.0074766650795936584, + "learning_rate": 0.2900483193643172, + "loss": 0.0983, + "num_input_tokens_seen": 4220048, + "step": 4665 + }, + { + "epoch": 1.2325458624785535, + "grad_norm": 0.004176498856395483, + "learning_rate": 0.29002721046959934, + "loss": 0.1378, + "num_input_tokens_seen": 4224848, + "step": 4670 + }, + { + "epoch": 1.2338656460340505, + "grad_norm": 0.012122509069740772, + "learning_rate": 0.29000607998092587, + "loss": 0.1475, + "num_input_tokens_seen": 4229552, + "step": 4675 + }, + { + "epoch": 1.2351854295895472, + "grad_norm": 0.010444805957376957, + "learning_rate": 0.2899849279015555, + "loss": 0.1808, + "num_input_tokens_seen": 4234384, + "step": 4680 + }, + { + "epoch": 1.2365052131450442, + "grad_norm": 0.00795635487884283, + "learning_rate": 0.28996375423475007, + "loss": 0.1234, + "num_input_tokens_seen": 4239312, + "step": 4685 + }, + { + "epoch": 1.237824996700541, + "grad_norm": 0.007227595429867506, + "learning_rate": 0.28994255898377486, + "loss": 0.146, + "num_input_tokens_seen": 4244048, + "step": 4690 + }, + { + "epoch": 1.239144780256038, + "grad_norm": 0.00508686201646924, + "learning_rate": 0.2899213421518984, + "loss": 0.1058, + "num_input_tokens_seen": 4248464, + "step": 4695 + }, + { + "epoch": 1.240464563811535, + "grad_norm": 0.00791617389768362, + "learning_rate": 0.2899001037423926, + "loss": 0.1317, + "num_input_tokens_seen": 4253104, + "step": 4700 + }, + { + "epoch": 1.2417843473670318, + "grad_norm": 0.007185694761574268, + "learning_rate": 0.28987884375853273, + "loss": 0.1141, + "num_input_tokens_seen": 4257648, + "step": 4705 + }, + { + "epoch": 1.2431041309225286, + "grad_norm": 0.010027085430920124, + "learning_rate": 0.2898575622035974, + "loss": 0.1218, + "num_input_tokens_seen": 4262128, + "step": 4710 + }, + { + "epoch": 1.2444239144780256, + "grad_norm": 0.0055501945316791534, + "learning_rate": 0.2898362590808683, + "loss": 0.1314, + "num_input_tokens_seen": 4266480, + "step": 4715 + }, + { + "epoch": 1.2457436980335226, + "grad_norm": 0.0041010682471096516, + "learning_rate": 0.2898149343936308, + "loss": 0.1347, + "num_input_tokens_seen": 4271152, + "step": 4720 + }, + { + "epoch": 1.2470634815890194, + "grad_norm": 0.004378339741379023, + "learning_rate": 0.2897935881451734, + "loss": 0.1489, + "num_input_tokens_seen": 4275536, + "step": 4725 + }, + { + "epoch": 1.2483832651445164, + "grad_norm": 0.005178438033908606, + "learning_rate": 0.28977222033878797, + "loss": 0.1068, + "num_input_tokens_seen": 4280080, + "step": 4730 + }, + { + "epoch": 1.2497030487000131, + "grad_norm": 0.00825351383537054, + "learning_rate": 0.28975083097776966, + "loss": 0.1527, + "num_input_tokens_seen": 4284528, + "step": 4735 + }, + { + "epoch": 1.2510228322555101, + "grad_norm": 0.006269282195717096, + "learning_rate": 0.28972942006541696, + "loss": 0.1098, + "num_input_tokens_seen": 4288912, + "step": 4740 + }, + { + "epoch": 1.252342615811007, + "grad_norm": 0.0034152015578001738, + "learning_rate": 0.2897079876050318, + "loss": 0.1113, + "num_input_tokens_seen": 4293424, + "step": 4745 + }, + { + "epoch": 1.253662399366504, + "grad_norm": 0.007530711591243744, + "learning_rate": 0.2896865335999192, + "loss": 0.1272, + "num_input_tokens_seen": 4297968, + "step": 4750 + }, + { + "epoch": 1.2549821829220007, + "grad_norm": 0.007087291218340397, + "learning_rate": 0.28966505805338777, + "loss": 0.1602, + "num_input_tokens_seen": 4302448, + "step": 4755 + }, + { + "epoch": 1.2563019664774977, + "grad_norm": 0.0051059415563941, + "learning_rate": 0.2896435609687492, + "loss": 0.1465, + "num_input_tokens_seen": 4307184, + "step": 4760 + }, + { + "epoch": 1.2576217500329947, + "grad_norm": 0.006313722115010023, + "learning_rate": 0.2896220423493187, + "loss": 0.1198, + "num_input_tokens_seen": 4311856, + "step": 4765 + }, + { + "epoch": 1.2589415335884915, + "grad_norm": 0.0060476865619421005, + "learning_rate": 0.28960050219841466, + "loss": 0.1297, + "num_input_tokens_seen": 4316208, + "step": 4770 + }, + { + "epoch": 1.2602613171439883, + "grad_norm": 0.0069470033049583435, + "learning_rate": 0.28957894051935884, + "loss": 0.1049, + "num_input_tokens_seen": 4320912, + "step": 4775 + }, + { + "epoch": 1.2615811006994853, + "grad_norm": 0.004824855364859104, + "learning_rate": 0.2895573573154764, + "loss": 0.1482, + "num_input_tokens_seen": 4325584, + "step": 4780 + }, + { + "epoch": 1.2629008842549823, + "grad_norm": 0.002742300508543849, + "learning_rate": 0.28953575259009556, + "loss": 0.1013, + "num_input_tokens_seen": 4329936, + "step": 4785 + }, + { + "epoch": 1.264220667810479, + "grad_norm": 0.0037561177741736174, + "learning_rate": 0.2895141263465482, + "loss": 0.1182, + "num_input_tokens_seen": 4334576, + "step": 4790 + }, + { + "epoch": 1.265540451365976, + "grad_norm": 0.0050350818783044815, + "learning_rate": 0.28949247858816934, + "loss": 0.1163, + "num_input_tokens_seen": 4338800, + "step": 4795 + }, + { + "epoch": 1.2668602349214728, + "grad_norm": 0.006403943989425898, + "learning_rate": 0.2894708093182973, + "loss": 0.1062, + "num_input_tokens_seen": 4343440, + "step": 4800 + }, + { + "epoch": 1.2668602349214728, + "eval_loss": 0.1273791790008545, + "eval_runtime": 75.8839, + "eval_samples_per_second": 88.754, + "eval_steps_per_second": 22.192, + "num_input_tokens_seen": 4343440, + "step": 4800 + }, + { + "epoch": 1.2681800184769698, + "grad_norm": 0.005659022834151983, + "learning_rate": 0.2894491185402737, + "loss": 0.1176, + "num_input_tokens_seen": 4347856, + "step": 4805 + }, + { + "epoch": 1.2694998020324666, + "grad_norm": 0.006581745110452175, + "learning_rate": 0.2894274062574437, + "loss": 0.0793, + "num_input_tokens_seen": 4352368, + "step": 4810 + }, + { + "epoch": 1.2708195855879636, + "grad_norm": 0.008169785141944885, + "learning_rate": 0.2894056724731554, + "loss": 0.1494, + "num_input_tokens_seen": 4357008, + "step": 4815 + }, + { + "epoch": 1.2721393691434604, + "grad_norm": 0.007960627786815166, + "learning_rate": 0.28938391719076056, + "loss": 0.1063, + "num_input_tokens_seen": 4361936, + "step": 4820 + }, + { + "epoch": 1.2734591526989574, + "grad_norm": 0.0033328793942928314, + "learning_rate": 0.28936214041361413, + "loss": 0.0941, + "num_input_tokens_seen": 4366576, + "step": 4825 + }, + { + "epoch": 1.2747789362544544, + "grad_norm": 0.005033844616264105, + "learning_rate": 0.2893403421450743, + "loss": 0.1362, + "num_input_tokens_seen": 4371024, + "step": 4830 + }, + { + "epoch": 1.2760987198099512, + "grad_norm": 0.004738932941108942, + "learning_rate": 0.2893185223885026, + "loss": 0.1348, + "num_input_tokens_seen": 4375664, + "step": 4835 + }, + { + "epoch": 1.277418503365448, + "grad_norm": 0.005771535914391279, + "learning_rate": 0.289296681147264, + "loss": 0.132, + "num_input_tokens_seen": 4380496, + "step": 4840 + }, + { + "epoch": 1.278738286920945, + "grad_norm": 0.009553518146276474, + "learning_rate": 0.28927481842472663, + "loss": 0.142, + "num_input_tokens_seen": 4385264, + "step": 4845 + }, + { + "epoch": 1.280058070476442, + "grad_norm": 0.004774996545165777, + "learning_rate": 0.28925293422426207, + "loss": 0.1162, + "num_input_tokens_seen": 4389808, + "step": 4850 + }, + { + "epoch": 1.2813778540319387, + "grad_norm": 0.004641884472221136, + "learning_rate": 0.28923102854924504, + "loss": 0.1038, + "num_input_tokens_seen": 4394480, + "step": 4855 + }, + { + "epoch": 1.2826976375874357, + "grad_norm": 0.008869183249771595, + "learning_rate": 0.2892091014030537, + "loss": 0.1378, + "num_input_tokens_seen": 4399152, + "step": 4860 + }, + { + "epoch": 1.2840174211429325, + "grad_norm": 0.004630484618246555, + "learning_rate": 0.2891871527890696, + "loss": 0.0891, + "num_input_tokens_seen": 4403760, + "step": 4865 + }, + { + "epoch": 1.2853372046984295, + "grad_norm": 0.006253865547478199, + "learning_rate": 0.2891651827106773, + "loss": 0.1207, + "num_input_tokens_seen": 4408016, + "step": 4870 + }, + { + "epoch": 1.2866569882539265, + "grad_norm": 0.006185167469084263, + "learning_rate": 0.2891431911712651, + "loss": 0.1219, + "num_input_tokens_seen": 4412592, + "step": 4875 + }, + { + "epoch": 1.2879767718094233, + "grad_norm": 0.009770900942385197, + "learning_rate": 0.2891211781742241, + "loss": 0.1406, + "num_input_tokens_seen": 4417040, + "step": 4880 + }, + { + "epoch": 1.28929655536492, + "grad_norm": 0.00822899118065834, + "learning_rate": 0.2890991437229492, + "loss": 0.0967, + "num_input_tokens_seen": 4421680, + "step": 4885 + }, + { + "epoch": 1.290616338920417, + "grad_norm": 0.006727734114974737, + "learning_rate": 0.2890770878208383, + "loss": 0.1422, + "num_input_tokens_seen": 4426128, + "step": 4890 + }, + { + "epoch": 1.291936122475914, + "grad_norm": 0.0057727801613509655, + "learning_rate": 0.28905501047129273, + "loss": 0.1162, + "num_input_tokens_seen": 4430448, + "step": 4895 + }, + { + "epoch": 1.2932559060314108, + "grad_norm": 0.0035950900055468082, + "learning_rate": 0.289032911677717, + "loss": 0.1017, + "num_input_tokens_seen": 4434896, + "step": 4900 + }, + { + "epoch": 1.2945756895869078, + "grad_norm": 0.009969254955649376, + "learning_rate": 0.28901079144351915, + "loss": 0.1271, + "num_input_tokens_seen": 4439824, + "step": 4905 + }, + { + "epoch": 1.2958954731424046, + "grad_norm": 0.0049923793412745, + "learning_rate": 0.2889886497721103, + "loss": 0.148, + "num_input_tokens_seen": 4444336, + "step": 4910 + }, + { + "epoch": 1.2972152566979016, + "grad_norm": 0.005502726417034864, + "learning_rate": 0.28896648666690505, + "loss": 0.1441, + "num_input_tokens_seen": 4448912, + "step": 4915 + }, + { + "epoch": 1.2985350402533984, + "grad_norm": 0.0048463959246873856, + "learning_rate": 0.2889443021313212, + "loss": 0.152, + "num_input_tokens_seen": 4453296, + "step": 4920 + }, + { + "epoch": 1.2998548238088954, + "grad_norm": 0.006493000779300928, + "learning_rate": 0.28892209616877984, + "loss": 0.1457, + "num_input_tokens_seen": 4458128, + "step": 4925 + }, + { + "epoch": 1.3011746073643922, + "grad_norm": 0.004292042460292578, + "learning_rate": 0.28889986878270546, + "loss": 0.1152, + "num_input_tokens_seen": 4463024, + "step": 4930 + }, + { + "epoch": 1.3024943909198892, + "grad_norm": 0.004914049059152603, + "learning_rate": 0.28887761997652583, + "loss": 0.1193, + "num_input_tokens_seen": 4467472, + "step": 4935 + }, + { + "epoch": 1.3038141744753862, + "grad_norm": 0.00792317371815443, + "learning_rate": 0.2888553497536719, + "loss": 0.1509, + "num_input_tokens_seen": 4472208, + "step": 4940 + }, + { + "epoch": 1.305133958030883, + "grad_norm": 0.00600992189720273, + "learning_rate": 0.2888330581175781, + "loss": 0.1368, + "num_input_tokens_seen": 4477200, + "step": 4945 + }, + { + "epoch": 1.3064537415863797, + "grad_norm": 0.010630965232849121, + "learning_rate": 0.28881074507168203, + "loss": 0.1043, + "num_input_tokens_seen": 4481616, + "step": 4950 + }, + { + "epoch": 1.3077735251418767, + "grad_norm": 0.008435067720711231, + "learning_rate": 0.2887884106194247, + "loss": 0.1235, + "num_input_tokens_seen": 4486320, + "step": 4955 + }, + { + "epoch": 1.3090933086973737, + "grad_norm": 0.005451956298202276, + "learning_rate": 0.28876605476425027, + "loss": 0.0881, + "num_input_tokens_seen": 4490960, + "step": 4960 + }, + { + "epoch": 1.3104130922528705, + "grad_norm": 0.0067622060887515545, + "learning_rate": 0.2887436775096064, + "loss": 0.1285, + "num_input_tokens_seen": 4495792, + "step": 4965 + }, + { + "epoch": 1.3117328758083675, + "grad_norm": 0.007347963750362396, + "learning_rate": 0.2887212788589439, + "loss": 0.1847, + "num_input_tokens_seen": 4500368, + "step": 4970 + }, + { + "epoch": 1.3130526593638643, + "grad_norm": 0.007676871959120035, + "learning_rate": 0.2886988588157169, + "loss": 0.1179, + "num_input_tokens_seen": 4504752, + "step": 4975 + }, + { + "epoch": 1.3143724429193613, + "grad_norm": 0.004374609794467688, + "learning_rate": 0.28867641738338284, + "loss": 0.1461, + "num_input_tokens_seen": 4509424, + "step": 4980 + }, + { + "epoch": 1.315692226474858, + "grad_norm": 0.0069887335412204266, + "learning_rate": 0.2886539545654026, + "loss": 0.1135, + "num_input_tokens_seen": 4513648, + "step": 4985 + }, + { + "epoch": 1.317012010030355, + "grad_norm": 0.005233681760728359, + "learning_rate": 0.28863147036524006, + "loss": 0.1196, + "num_input_tokens_seen": 4518128, + "step": 4990 + }, + { + "epoch": 1.3183317935858518, + "grad_norm": 0.0048226951621472836, + "learning_rate": 0.2886089647863626, + "loss": 0.1005, + "num_input_tokens_seen": 4522352, + "step": 4995 + }, + { + "epoch": 1.3196515771413488, + "grad_norm": 0.003932212945073843, + "learning_rate": 0.288586437832241, + "loss": 0.1005, + "num_input_tokens_seen": 4526576, + "step": 5000 + }, + { + "epoch": 1.3196515771413488, + "eval_loss": 0.12307132035493851, + "eval_runtime": 75.6975, + "eval_samples_per_second": 88.973, + "eval_steps_per_second": 22.246, + "num_input_tokens_seen": 4526576, + "step": 5000 + }, + { + "epoch": 1.3209713606968458, + "grad_norm": 0.007522172760218382, + "learning_rate": 0.28856388950634904, + "loss": 0.1249, + "num_input_tokens_seen": 4530992, + "step": 5005 + }, + { + "epoch": 1.3222911442523426, + "grad_norm": 0.006707256659865379, + "learning_rate": 0.288541319812164, + "loss": 0.1554, + "num_input_tokens_seen": 4535600, + "step": 5010 + }, + { + "epoch": 1.3236109278078394, + "grad_norm": 0.00563955819234252, + "learning_rate": 0.2885187287531665, + "loss": 0.1966, + "num_input_tokens_seen": 4540240, + "step": 5015 + }, + { + "epoch": 1.3249307113633364, + "grad_norm": 0.005279666744172573, + "learning_rate": 0.2884961163328402, + "loss": 0.1216, + "num_input_tokens_seen": 4544848, + "step": 5020 + }, + { + "epoch": 1.3262504949188334, + "grad_norm": 0.0050764246843755245, + "learning_rate": 0.28847348255467237, + "loss": 0.132, + "num_input_tokens_seen": 4549264, + "step": 5025 + }, + { + "epoch": 1.3275702784743302, + "grad_norm": 0.006592562887817621, + "learning_rate": 0.28845082742215333, + "loss": 0.1354, + "num_input_tokens_seen": 4553456, + "step": 5030 + }, + { + "epoch": 1.3288900620298272, + "grad_norm": 0.005607139319181442, + "learning_rate": 0.2884281509387769, + "loss": 0.1161, + "num_input_tokens_seen": 4557968, + "step": 5035 + }, + { + "epoch": 1.330209845585324, + "grad_norm": 0.004653626121580601, + "learning_rate": 0.2884054531080399, + "loss": 0.1332, + "num_input_tokens_seen": 4562192, + "step": 5040 + }, + { + "epoch": 1.331529629140821, + "grad_norm": 0.00600946182385087, + "learning_rate": 0.28838273393344277, + "loss": 0.1512, + "num_input_tokens_seen": 4566704, + "step": 5045 + }, + { + "epoch": 1.3328494126963177, + "grad_norm": 0.010720454156398773, + "learning_rate": 0.288359993418489, + "loss": 0.1369, + "num_input_tokens_seen": 4571088, + "step": 5050 + }, + { + "epoch": 1.3341691962518147, + "grad_norm": 0.0023548235185444355, + "learning_rate": 0.28833723156668556, + "loss": 0.0845, + "num_input_tokens_seen": 4575536, + "step": 5055 + }, + { + "epoch": 1.3354889798073115, + "grad_norm": 0.005503352265805006, + "learning_rate": 0.2883144483815425, + "loss": 0.1729, + "num_input_tokens_seen": 4579952, + "step": 5060 + }, + { + "epoch": 1.3368087633628085, + "grad_norm": 0.0031644832342863083, + "learning_rate": 0.28829164386657335, + "loss": 0.1251, + "num_input_tokens_seen": 4584720, + "step": 5065 + }, + { + "epoch": 1.3381285469183055, + "grad_norm": 0.005799748934805393, + "learning_rate": 0.28826881802529486, + "loss": 0.1275, + "num_input_tokens_seen": 4589136, + "step": 5070 + }, + { + "epoch": 1.3394483304738023, + "grad_norm": 0.005566427484154701, + "learning_rate": 0.28824597086122705, + "loss": 0.1224, + "num_input_tokens_seen": 4593456, + "step": 5075 + }, + { + "epoch": 1.340768114029299, + "grad_norm": 0.0049245222471654415, + "learning_rate": 0.28822310237789317, + "loss": 0.1026, + "num_input_tokens_seen": 4597936, + "step": 5080 + }, + { + "epoch": 1.342087897584796, + "grad_norm": 0.006236841436475515, + "learning_rate": 0.2882002125788199, + "loss": 0.1271, + "num_input_tokens_seen": 4602256, + "step": 5085 + }, + { + "epoch": 1.343407681140293, + "grad_norm": 0.009819005616009235, + "learning_rate": 0.2881773014675371, + "loss": 0.1643, + "num_input_tokens_seen": 4606960, + "step": 5090 + }, + { + "epoch": 1.3447274646957899, + "grad_norm": 0.004741772077977657, + "learning_rate": 0.288154369047578, + "loss": 0.1606, + "num_input_tokens_seen": 4611760, + "step": 5095 + }, + { + "epoch": 1.3460472482512869, + "grad_norm": 0.0038753061089664698, + "learning_rate": 0.28813141532247905, + "loss": 0.1012, + "num_input_tokens_seen": 4616592, + "step": 5100 + }, + { + "epoch": 1.3473670318067836, + "grad_norm": 0.005063305608928204, + "learning_rate": 0.28810844029578, + "loss": 0.1483, + "num_input_tokens_seen": 4620784, + "step": 5105 + }, + { + "epoch": 1.3486868153622806, + "grad_norm": 0.005523025989532471, + "learning_rate": 0.2880854439710238, + "loss": 0.136, + "num_input_tokens_seen": 4625456, + "step": 5110 + }, + { + "epoch": 1.3500065989177774, + "grad_norm": 0.0075851683504879475, + "learning_rate": 0.28806242635175694, + "loss": 0.097, + "num_input_tokens_seen": 4630224, + "step": 5115 + }, + { + "epoch": 1.3513263824732744, + "grad_norm": 0.006082105450332165, + "learning_rate": 0.2880393874415289, + "loss": 0.1241, + "num_input_tokens_seen": 4634864, + "step": 5120 + }, + { + "epoch": 1.3526461660287712, + "grad_norm": 0.004730699118226767, + "learning_rate": 0.2880163272438926, + "loss": 0.1303, + "num_input_tokens_seen": 4639408, + "step": 5125 + }, + { + "epoch": 1.3539659495842682, + "grad_norm": 0.006545414216816425, + "learning_rate": 0.2879932457624042, + "loss": 0.1115, + "num_input_tokens_seen": 4643984, + "step": 5130 + }, + { + "epoch": 1.3552857331397652, + "grad_norm": 0.004932395648211241, + "learning_rate": 0.2879701430006232, + "loss": 0.1054, + "num_input_tokens_seen": 4648432, + "step": 5135 + }, + { + "epoch": 1.356605516695262, + "grad_norm": 0.005811200477182865, + "learning_rate": 0.28794701896211233, + "loss": 0.0995, + "num_input_tokens_seen": 4653072, + "step": 5140 + }, + { + "epoch": 1.3579253002507587, + "grad_norm": 0.007056174799799919, + "learning_rate": 0.28792387365043753, + "loss": 0.1535, + "num_input_tokens_seen": 4657488, + "step": 5145 + }, + { + "epoch": 1.3592450838062557, + "grad_norm": 0.005356707144528627, + "learning_rate": 0.28790070706916815, + "loss": 0.0988, + "num_input_tokens_seen": 4661968, + "step": 5150 + }, + { + "epoch": 1.3605648673617527, + "grad_norm": 0.005359330680221319, + "learning_rate": 0.2878775192218768, + "loss": 0.1356, + "num_input_tokens_seen": 4666608, + "step": 5155 + }, + { + "epoch": 1.3618846509172495, + "grad_norm": 0.006502909120172262, + "learning_rate": 0.2878543101121393, + "loss": 0.0998, + "num_input_tokens_seen": 4671152, + "step": 5160 + }, + { + "epoch": 1.3632044344727465, + "grad_norm": 0.004497040528804064, + "learning_rate": 0.28783107974353483, + "loss": 0.1154, + "num_input_tokens_seen": 4675440, + "step": 5165 + }, + { + "epoch": 1.3645242180282433, + "grad_norm": 0.007243863306939602, + "learning_rate": 0.2878078281196457, + "loss": 0.1401, + "num_input_tokens_seen": 4680048, + "step": 5170 + }, + { + "epoch": 1.3658440015837403, + "grad_norm": 0.0053054410964250565, + "learning_rate": 0.28778455524405777, + "loss": 0.0963, + "num_input_tokens_seen": 4684432, + "step": 5175 + }, + { + "epoch": 1.367163785139237, + "grad_norm": 0.010718608275055885, + "learning_rate": 0.2877612611203598, + "loss": 0.1443, + "num_input_tokens_seen": 4689200, + "step": 5180 + }, + { + "epoch": 1.368483568694734, + "grad_norm": 0.0078563978895545, + "learning_rate": 0.28773794575214423, + "loss": 0.1401, + "num_input_tokens_seen": 4693904, + "step": 5185 + }, + { + "epoch": 1.3698033522502309, + "grad_norm": 0.005086524877697229, + "learning_rate": 0.28771460914300645, + "loss": 0.1072, + "num_input_tokens_seen": 4698704, + "step": 5190 + }, + { + "epoch": 1.3711231358057279, + "grad_norm": 0.0034000668674707413, + "learning_rate": 0.2876912512965454, + "loss": 0.1049, + "num_input_tokens_seen": 4703472, + "step": 5195 + }, + { + "epoch": 1.3724429193612249, + "grad_norm": 0.007837867364287376, + "learning_rate": 0.287667872216363, + "loss": 0.1116, + "num_input_tokens_seen": 4707952, + "step": 5200 + }, + { + "epoch": 1.3724429193612249, + "eval_loss": 0.13345499336719513, + "eval_runtime": 75.9166, + "eval_samples_per_second": 88.716, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 4707952, + "step": 5200 + }, + { + "epoch": 1.3737627029167216, + "grad_norm": 0.0034495049621909857, + "learning_rate": 0.2876444719060647, + "loss": 0.0914, + "num_input_tokens_seen": 4712464, + "step": 5205 + }, + { + "epoch": 1.3750824864722184, + "grad_norm": 0.0036805137060582638, + "learning_rate": 0.287621050369259, + "loss": 0.1009, + "num_input_tokens_seen": 4716976, + "step": 5210 + }, + { + "epoch": 1.3764022700277154, + "grad_norm": 0.002947188215330243, + "learning_rate": 0.28759760760955794, + "loss": 0.1235, + "num_input_tokens_seen": 4721616, + "step": 5215 + }, + { + "epoch": 1.3777220535832124, + "grad_norm": 0.006393314339220524, + "learning_rate": 0.2875741436305766, + "loss": 0.1289, + "num_input_tokens_seen": 4726000, + "step": 5220 + }, + { + "epoch": 1.3790418371387092, + "grad_norm": 0.0054276916198432446, + "learning_rate": 0.28755065843593347, + "loss": 0.1156, + "num_input_tokens_seen": 4730192, + "step": 5225 + }, + { + "epoch": 1.3803616206942062, + "grad_norm": 0.005358170252293348, + "learning_rate": 0.2875271520292502, + "loss": 0.1716, + "num_input_tokens_seen": 4734736, + "step": 5230 + }, + { + "epoch": 1.381681404249703, + "grad_norm": 0.0023333069402724504, + "learning_rate": 0.28750362441415184, + "loss": 0.1189, + "num_input_tokens_seen": 4739504, + "step": 5235 + }, + { + "epoch": 1.3830011878052, + "grad_norm": 0.004326107446104288, + "learning_rate": 0.28748007559426664, + "loss": 0.1069, + "num_input_tokens_seen": 4744016, + "step": 5240 + }, + { + "epoch": 1.384320971360697, + "grad_norm": 0.004748395178467035, + "learning_rate": 0.2874565055732261, + "loss": 0.1428, + "num_input_tokens_seen": 4748496, + "step": 5245 + }, + { + "epoch": 1.3856407549161938, + "grad_norm": 0.007011250592768192, + "learning_rate": 0.28743291435466495, + "loss": 0.1366, + "num_input_tokens_seen": 4752976, + "step": 5250 + }, + { + "epoch": 1.3869605384716905, + "grad_norm": 0.00410803547129035, + "learning_rate": 0.2874093019422214, + "loss": 0.1246, + "num_input_tokens_seen": 4757616, + "step": 5255 + }, + { + "epoch": 1.3882803220271875, + "grad_norm": 0.0034303395077586174, + "learning_rate": 0.28738566833953666, + "loss": 0.1124, + "num_input_tokens_seen": 4762192, + "step": 5260 + }, + { + "epoch": 1.3896001055826845, + "grad_norm": 0.005327479913830757, + "learning_rate": 0.28736201355025537, + "loss": 0.0967, + "num_input_tokens_seen": 4766608, + "step": 5265 + }, + { + "epoch": 1.3909198891381813, + "grad_norm": 0.0029369592666625977, + "learning_rate": 0.28733833757802535, + "loss": 0.1201, + "num_input_tokens_seen": 4770928, + "step": 5270 + }, + { + "epoch": 1.392239672693678, + "grad_norm": 0.0027849709149450064, + "learning_rate": 0.28731464042649785, + "loss": 0.1027, + "num_input_tokens_seen": 4775408, + "step": 5275 + }, + { + "epoch": 1.393559456249175, + "grad_norm": 0.002889209194108844, + "learning_rate": 0.2872909220993271, + "loss": 0.0804, + "num_input_tokens_seen": 4779952, + "step": 5280 + }, + { + "epoch": 1.394879239804672, + "grad_norm": 0.004873353522270918, + "learning_rate": 0.287267182600171, + "loss": 0.1284, + "num_input_tokens_seen": 4784176, + "step": 5285 + }, + { + "epoch": 1.3961990233601689, + "grad_norm": 0.0064987437799572945, + "learning_rate": 0.2872434219326902, + "loss": 0.1363, + "num_input_tokens_seen": 4788528, + "step": 5290 + }, + { + "epoch": 1.3975188069156659, + "grad_norm": 0.0033241244964301586, + "learning_rate": 0.28721964010054907, + "loss": 0.0961, + "num_input_tokens_seen": 4792816, + "step": 5295 + }, + { + "epoch": 1.3988385904711627, + "grad_norm": 0.0030303695239126682, + "learning_rate": 0.28719583710741503, + "loss": 0.1047, + "num_input_tokens_seen": 4797488, + "step": 5300 + }, + { + "epoch": 1.4001583740266597, + "grad_norm": 0.005134504288434982, + "learning_rate": 0.28717201295695877, + "loss": 0.1097, + "num_input_tokens_seen": 4802000, + "step": 5305 + }, + { + "epoch": 1.4014781575821567, + "grad_norm": 0.007868612185120583, + "learning_rate": 0.28714816765285434, + "loss": 0.1558, + "num_input_tokens_seen": 4806608, + "step": 5310 + }, + { + "epoch": 1.4027979411376534, + "grad_norm": 0.004361919593065977, + "learning_rate": 0.28712430119877896, + "loss": 0.1705, + "num_input_tokens_seen": 4810928, + "step": 5315 + }, + { + "epoch": 1.4041177246931502, + "grad_norm": 0.0045688156969845295, + "learning_rate": 0.28710041359841304, + "loss": 0.1371, + "num_input_tokens_seen": 4815664, + "step": 5320 + }, + { + "epoch": 1.4054375082486472, + "grad_norm": 0.0036298823542892933, + "learning_rate": 0.28707650485544056, + "loss": 0.1257, + "num_input_tokens_seen": 4820368, + "step": 5325 + }, + { + "epoch": 1.4067572918041442, + "grad_norm": 0.00613021245226264, + "learning_rate": 0.28705257497354836, + "loss": 0.1691, + "num_input_tokens_seen": 4824560, + "step": 5330 + }, + { + "epoch": 1.408077075359641, + "grad_norm": 0.0018192896386608481, + "learning_rate": 0.28702862395642675, + "loss": 0.1107, + "num_input_tokens_seen": 4828848, + "step": 5335 + }, + { + "epoch": 1.409396858915138, + "grad_norm": 0.004845349583774805, + "learning_rate": 0.28700465180776935, + "loss": 0.1128, + "num_input_tokens_seen": 4833136, + "step": 5340 + }, + { + "epoch": 1.4107166424706348, + "grad_norm": 0.0076218475587666035, + "learning_rate": 0.2869806585312729, + "loss": 0.1032, + "num_input_tokens_seen": 4837776, + "step": 5345 + }, + { + "epoch": 1.4120364260261318, + "grad_norm": 0.004803601652383804, + "learning_rate": 0.28695664413063754, + "loss": 0.1208, + "num_input_tokens_seen": 4842480, + "step": 5350 + }, + { + "epoch": 1.4133562095816286, + "grad_norm": 0.005508695729076862, + "learning_rate": 0.28693260860956654, + "loss": 0.1039, + "num_input_tokens_seen": 4846896, + "step": 5355 + }, + { + "epoch": 1.4146759931371256, + "grad_norm": 0.00689000403508544, + "learning_rate": 0.2869085519717665, + "loss": 0.1358, + "num_input_tokens_seen": 4851440, + "step": 5360 + }, + { + "epoch": 1.4159957766926223, + "grad_norm": 0.0043087066151201725, + "learning_rate": 0.28688447422094726, + "loss": 0.1429, + "num_input_tokens_seen": 4855856, + "step": 5365 + }, + { + "epoch": 1.4173155602481193, + "grad_norm": 0.004998628981411457, + "learning_rate": 0.2868603753608219, + "loss": 0.0997, + "num_input_tokens_seen": 4860496, + "step": 5370 + }, + { + "epoch": 1.4186353438036163, + "grad_norm": 0.005242071580141783, + "learning_rate": 0.28683625539510665, + "loss": 0.121, + "num_input_tokens_seen": 4864784, + "step": 5375 + }, + { + "epoch": 1.4199551273591131, + "grad_norm": 0.004551622085273266, + "learning_rate": 0.28681211432752135, + "loss": 0.1323, + "num_input_tokens_seen": 4869424, + "step": 5380 + }, + { + "epoch": 1.42127491091461, + "grad_norm": 0.004252149723470211, + "learning_rate": 0.2867879521617887, + "loss": 0.1166, + "num_input_tokens_seen": 4874288, + "step": 5385 + }, + { + "epoch": 1.422594694470107, + "grad_norm": 0.0044906227849423885, + "learning_rate": 0.28676376890163485, + "loss": 0.1108, + "num_input_tokens_seen": 4878800, + "step": 5390 + }, + { + "epoch": 1.423914478025604, + "grad_norm": 0.004656235221773386, + "learning_rate": 0.2867395645507891, + "loss": 0.1564, + "num_input_tokens_seen": 4883376, + "step": 5395 + }, + { + "epoch": 1.4252342615811007, + "grad_norm": 0.005371542181819677, + "learning_rate": 0.2867153391129842, + "loss": 0.1202, + "num_input_tokens_seen": 4887824, + "step": 5400 + }, + { + "epoch": 1.4252342615811007, + "eval_loss": 0.12314480543136597, + "eval_runtime": 75.8109, + "eval_samples_per_second": 88.84, + "eval_steps_per_second": 22.213, + "num_input_tokens_seen": 4887824, + "step": 5400 + }, + { + "epoch": 1.4265540451365977, + "grad_norm": 0.006320716347545385, + "learning_rate": 0.28669109259195585, + "loss": 0.1283, + "num_input_tokens_seen": 4892144, + "step": 5405 + }, + { + "epoch": 1.4278738286920944, + "grad_norm": 0.006617533974349499, + "learning_rate": 0.2866668249914433, + "loss": 0.1651, + "num_input_tokens_seen": 4896464, + "step": 5410 + }, + { + "epoch": 1.4291936122475914, + "grad_norm": 0.006405434105545282, + "learning_rate": 0.2866425363151889, + "loss": 0.1605, + "num_input_tokens_seen": 4900848, + "step": 5415 + }, + { + "epoch": 1.4305133958030882, + "grad_norm": 0.007163924165070057, + "learning_rate": 0.2866182265669382, + "loss": 0.1013, + "num_input_tokens_seen": 4905328, + "step": 5420 + }, + { + "epoch": 1.4318331793585852, + "grad_norm": 0.007256188429892063, + "learning_rate": 0.28659389575044014, + "loss": 0.1396, + "num_input_tokens_seen": 4910000, + "step": 5425 + }, + { + "epoch": 1.433152962914082, + "grad_norm": 0.006785478442907333, + "learning_rate": 0.28656954386944683, + "loss": 0.1418, + "num_input_tokens_seen": 4914864, + "step": 5430 + }, + { + "epoch": 1.434472746469579, + "grad_norm": 0.007839103229343891, + "learning_rate": 0.28654517092771353, + "loss": 0.1478, + "num_input_tokens_seen": 4919376, + "step": 5435 + }, + { + "epoch": 1.435792530025076, + "grad_norm": 0.008518414571881294, + "learning_rate": 0.286520776928999, + "loss": 0.1787, + "num_input_tokens_seen": 4923792, + "step": 5440 + }, + { + "epoch": 1.4371123135805728, + "grad_norm": 0.004518203437328339, + "learning_rate": 0.286496361877065, + "loss": 0.1174, + "num_input_tokens_seen": 4928400, + "step": 5445 + }, + { + "epoch": 1.4384320971360696, + "grad_norm": 0.005558178760111332, + "learning_rate": 0.28647192577567676, + "loss": 0.1226, + "num_input_tokens_seen": 4932752, + "step": 5450 + }, + { + "epoch": 1.4397518806915666, + "grad_norm": 0.005305645056068897, + "learning_rate": 0.28644746862860254, + "loss": 0.1253, + "num_input_tokens_seen": 4937264, + "step": 5455 + }, + { + "epoch": 1.4410716642470636, + "grad_norm": 0.006345751695334911, + "learning_rate": 0.2864229904396139, + "loss": 0.1101, + "num_input_tokens_seen": 4941680, + "step": 5460 + }, + { + "epoch": 1.4423914478025603, + "grad_norm": 0.0019763587042689323, + "learning_rate": 0.28639849121248573, + "loss": 0.103, + "num_input_tokens_seen": 4946096, + "step": 5465 + }, + { + "epoch": 1.4437112313580573, + "grad_norm": 0.007189044263213873, + "learning_rate": 0.28637397095099615, + "loss": 0.1117, + "num_input_tokens_seen": 4950224, + "step": 5470 + }, + { + "epoch": 1.4450310149135541, + "grad_norm": 0.009603499434888363, + "learning_rate": 0.28634942965892646, + "loss": 0.1012, + "num_input_tokens_seen": 4954768, + "step": 5475 + }, + { + "epoch": 1.4463507984690511, + "grad_norm": 0.006515604443848133, + "learning_rate": 0.28632486734006124, + "loss": 0.1427, + "num_input_tokens_seen": 4959376, + "step": 5480 + }, + { + "epoch": 1.447670582024548, + "grad_norm": 0.004549719858914614, + "learning_rate": 0.28630028399818835, + "loss": 0.1462, + "num_input_tokens_seen": 4963696, + "step": 5485 + }, + { + "epoch": 1.448990365580045, + "grad_norm": 0.003903538454324007, + "learning_rate": 0.2862756796370987, + "loss": 0.1176, + "num_input_tokens_seen": 4968144, + "step": 5490 + }, + { + "epoch": 1.4503101491355417, + "grad_norm": 0.007621736265718937, + "learning_rate": 0.2862510542605868, + "loss": 0.1541, + "num_input_tokens_seen": 4972752, + "step": 5495 + }, + { + "epoch": 1.4516299326910387, + "grad_norm": 0.005023064091801643, + "learning_rate": 0.2862264078724501, + "loss": 0.1054, + "num_input_tokens_seen": 4977456, + "step": 5500 + }, + { + "epoch": 1.4529497162465357, + "grad_norm": 0.00472296355292201, + "learning_rate": 0.28620174047648933, + "loss": 0.125, + "num_input_tokens_seen": 4981968, + "step": 5505 + }, + { + "epoch": 1.4542694998020325, + "grad_norm": 0.0038993775378912687, + "learning_rate": 0.2861770520765086, + "loss": 0.1341, + "num_input_tokens_seen": 4986320, + "step": 5510 + }, + { + "epoch": 1.4555892833575292, + "grad_norm": 0.00453393766656518, + "learning_rate": 0.2861523426763151, + "loss": 0.1081, + "num_input_tokens_seen": 4990800, + "step": 5515 + }, + { + "epoch": 1.4569090669130262, + "grad_norm": 0.007610528729856014, + "learning_rate": 0.2861276122797194, + "loss": 0.1084, + "num_input_tokens_seen": 4995472, + "step": 5520 + }, + { + "epoch": 1.4582288504685232, + "grad_norm": 0.0025153399910777807, + "learning_rate": 0.28610286089053516, + "loss": 0.0977, + "num_input_tokens_seen": 5000400, + "step": 5525 + }, + { + "epoch": 1.45954863402402, + "grad_norm": 0.004825440235435963, + "learning_rate": 0.28607808851257943, + "loss": 0.1076, + "num_input_tokens_seen": 5004752, + "step": 5530 + }, + { + "epoch": 1.460868417579517, + "grad_norm": 0.008536904118955135, + "learning_rate": 0.28605329514967237, + "loss": 0.1609, + "num_input_tokens_seen": 5009616, + "step": 5535 + }, + { + "epoch": 1.4621882011350138, + "grad_norm": 0.00748096639290452, + "learning_rate": 0.2860284808056374, + "loss": 0.1283, + "num_input_tokens_seen": 5014192, + "step": 5540 + }, + { + "epoch": 1.4635079846905108, + "grad_norm": 0.002886181464418769, + "learning_rate": 0.28600364548430135, + "loss": 0.1424, + "num_input_tokens_seen": 5018608, + "step": 5545 + }, + { + "epoch": 1.4648277682460076, + "grad_norm": 0.008722193539142609, + "learning_rate": 0.28597878918949393, + "loss": 0.1508, + "num_input_tokens_seen": 5023344, + "step": 5550 + }, + { + "epoch": 1.4661475518015046, + "grad_norm": 0.006218975875526667, + "learning_rate": 0.2859539119250485, + "loss": 0.1279, + "num_input_tokens_seen": 5028080, + "step": 5555 + }, + { + "epoch": 1.4674673353570014, + "grad_norm": 0.006654171273112297, + "learning_rate": 0.2859290136948013, + "loss": 0.1451, + "num_input_tokens_seen": 5032464, + "step": 5560 + }, + { + "epoch": 1.4687871189124984, + "grad_norm": 0.006777998059988022, + "learning_rate": 0.28590409450259197, + "loss": 0.126, + "num_input_tokens_seen": 5037072, + "step": 5565 + }, + { + "epoch": 1.4701069024679954, + "grad_norm": 0.004838022403419018, + "learning_rate": 0.28587915435226346, + "loss": 0.1531, + "num_input_tokens_seen": 5041520, + "step": 5570 + }, + { + "epoch": 1.4714266860234921, + "grad_norm": 0.004602334462106228, + "learning_rate": 0.2858541932476617, + "loss": 0.1382, + "num_input_tokens_seen": 5046032, + "step": 5575 + }, + { + "epoch": 1.472746469578989, + "grad_norm": 0.005087859928607941, + "learning_rate": 0.2858292111926361, + "loss": 0.157, + "num_input_tokens_seen": 5050128, + "step": 5580 + }, + { + "epoch": 1.474066253134486, + "grad_norm": 0.004357889294624329, + "learning_rate": 0.28580420819103924, + "loss": 0.1202, + "num_input_tokens_seen": 5054480, + "step": 5585 + }, + { + "epoch": 1.475386036689983, + "grad_norm": 0.004086626693606377, + "learning_rate": 0.2857791842467269, + "loss": 0.1153, + "num_input_tokens_seen": 5059056, + "step": 5590 + }, + { + "epoch": 1.4767058202454797, + "grad_norm": 0.003605692880228162, + "learning_rate": 0.2857541393635579, + "loss": 0.1373, + "num_input_tokens_seen": 5063760, + "step": 5595 + }, + { + "epoch": 1.4780256038009767, + "grad_norm": 0.004422806669026613, + "learning_rate": 0.2857290735453948, + "loss": 0.1279, + "num_input_tokens_seen": 5068368, + "step": 5600 + }, + { + "epoch": 1.4780256038009767, + "eval_loss": 0.1258571743965149, + "eval_runtime": 75.8455, + "eval_samples_per_second": 88.799, + "eval_steps_per_second": 22.203, + "num_input_tokens_seen": 5068368, + "step": 5600 + }, + { + "epoch": 1.4793453873564735, + "grad_norm": 0.003697659820318222, + "learning_rate": 0.28570398679610276, + "loss": 0.0951, + "num_input_tokens_seen": 5072720, + "step": 5605 + }, + { + "epoch": 1.4806651709119705, + "grad_norm": 0.006315284408628941, + "learning_rate": 0.2856788791195506, + "loss": 0.1118, + "num_input_tokens_seen": 5077200, + "step": 5610 + }, + { + "epoch": 1.4819849544674673, + "grad_norm": 0.004396041855216026, + "learning_rate": 0.28565375051961023, + "loss": 0.1365, + "num_input_tokens_seen": 5081616, + "step": 5615 + }, + { + "epoch": 1.4833047380229643, + "grad_norm": 0.005848131608217955, + "learning_rate": 0.28562860100015686, + "loss": 0.1212, + "num_input_tokens_seen": 5086064, + "step": 5620 + }, + { + "epoch": 1.484624521578461, + "grad_norm": 0.005573166534304619, + "learning_rate": 0.2856034305650687, + "loss": 0.1248, + "num_input_tokens_seen": 5090768, + "step": 5625 + }, + { + "epoch": 1.485944305133958, + "grad_norm": 0.004908713977783918, + "learning_rate": 0.28557823921822756, + "loss": 0.0936, + "num_input_tokens_seen": 5095696, + "step": 5630 + }, + { + "epoch": 1.487264088689455, + "grad_norm": 0.004205943550914526, + "learning_rate": 0.2855530269635181, + "loss": 0.0854, + "num_input_tokens_seen": 5100048, + "step": 5635 + }, + { + "epoch": 1.4885838722449518, + "grad_norm": 0.005804220214486122, + "learning_rate": 0.2855277938048284, + "loss": 0.1146, + "num_input_tokens_seen": 5104400, + "step": 5640 + }, + { + "epoch": 1.4899036558004486, + "grad_norm": 0.004194050095975399, + "learning_rate": 0.2855025397460498, + "loss": 0.0901, + "num_input_tokens_seen": 5109008, + "step": 5645 + }, + { + "epoch": 1.4912234393559456, + "grad_norm": 0.00745352590456605, + "learning_rate": 0.28547726479107666, + "loss": 0.1259, + "num_input_tokens_seen": 5113680, + "step": 5650 + }, + { + "epoch": 1.4925432229114426, + "grad_norm": 0.003796046134084463, + "learning_rate": 0.2854519689438068, + "loss": 0.1143, + "num_input_tokens_seen": 5118288, + "step": 5655 + }, + { + "epoch": 1.4938630064669394, + "grad_norm": 0.006838208995759487, + "learning_rate": 0.2854266522081412, + "loss": 0.1232, + "num_input_tokens_seen": 5123024, + "step": 5660 + }, + { + "epoch": 1.4951827900224364, + "grad_norm": 0.0034987099934369326, + "learning_rate": 0.28540131458798385, + "loss": 0.0883, + "num_input_tokens_seen": 5127568, + "step": 5665 + }, + { + "epoch": 1.4965025735779331, + "grad_norm": 0.007083676289767027, + "learning_rate": 0.28537595608724226, + "loss": 0.1203, + "num_input_tokens_seen": 5132016, + "step": 5670 + }, + { + "epoch": 1.4978223571334301, + "grad_norm": 0.0037806315813213587, + "learning_rate": 0.28535057670982705, + "loss": 0.1036, + "num_input_tokens_seen": 5136656, + "step": 5675 + }, + { + "epoch": 1.4991421406889271, + "grad_norm": 0.003533110721036792, + "learning_rate": 0.285325176459652, + "loss": 0.111, + "num_input_tokens_seen": 5141136, + "step": 5680 + }, + { + "epoch": 1.500461924244424, + "grad_norm": 0.005235917400568724, + "learning_rate": 0.28529975534063406, + "loss": 0.1177, + "num_input_tokens_seen": 5145616, + "step": 5685 + }, + { + "epoch": 1.5017817077999207, + "grad_norm": 0.004053886979818344, + "learning_rate": 0.2852743133566936, + "loss": 0.1337, + "num_input_tokens_seen": 5150448, + "step": 5690 + }, + { + "epoch": 1.5031014913554177, + "grad_norm": 0.002230136888101697, + "learning_rate": 0.2852488505117541, + "loss": 0.1066, + "num_input_tokens_seen": 5155152, + "step": 5695 + }, + { + "epoch": 1.5044212749109147, + "grad_norm": 0.004391312599182129, + "learning_rate": 0.28522336680974214, + "loss": 0.0989, + "num_input_tokens_seen": 5159824, + "step": 5700 + }, + { + "epoch": 1.5057410584664115, + "grad_norm": 0.004384173080325127, + "learning_rate": 0.2851978622545877, + "loss": 0.0917, + "num_input_tokens_seen": 5164272, + "step": 5705 + }, + { + "epoch": 1.5070608420219083, + "grad_norm": 0.004126288928091526, + "learning_rate": 0.285172336850224, + "loss": 0.1266, + "num_input_tokens_seen": 5168656, + "step": 5710 + }, + { + "epoch": 1.5083806255774053, + "grad_norm": 0.0053826323710381985, + "learning_rate": 0.2851467906005871, + "loss": 0.0922, + "num_input_tokens_seen": 5173392, + "step": 5715 + }, + { + "epoch": 1.5097004091329023, + "grad_norm": 0.006306441500782967, + "learning_rate": 0.28512122350961683, + "loss": 0.132, + "num_input_tokens_seen": 5177872, + "step": 5720 + }, + { + "epoch": 1.5110201926883993, + "grad_norm": 0.006877110339701176, + "learning_rate": 0.2850956355812559, + "loss": 0.0972, + "num_input_tokens_seen": 5182352, + "step": 5725 + }, + { + "epoch": 1.512339976243896, + "grad_norm": 0.0030152187682688236, + "learning_rate": 0.28507002681945015, + "loss": 0.1184, + "num_input_tokens_seen": 5186704, + "step": 5730 + }, + { + "epoch": 1.5136597597993928, + "grad_norm": 0.004878505598753691, + "learning_rate": 0.28504439722814895, + "loss": 0.137, + "num_input_tokens_seen": 5191504, + "step": 5735 + }, + { + "epoch": 1.5149795433548898, + "grad_norm": 0.002389514585956931, + "learning_rate": 0.28501874681130457, + "loss": 0.1277, + "num_input_tokens_seen": 5195856, + "step": 5740 + }, + { + "epoch": 1.5162993269103868, + "grad_norm": 0.005396494176238775, + "learning_rate": 0.2849930755728727, + "loss": 0.1425, + "num_input_tokens_seen": 5200464, + "step": 5745 + }, + { + "epoch": 1.5176191104658836, + "grad_norm": 0.004917625803500414, + "learning_rate": 0.28496738351681217, + "loss": 0.1289, + "num_input_tokens_seen": 5205136, + "step": 5750 + }, + { + "epoch": 1.5189388940213804, + "grad_norm": 0.006807031109929085, + "learning_rate": 0.284941670647085, + "loss": 0.145, + "num_input_tokens_seen": 5209840, + "step": 5755 + }, + { + "epoch": 1.5202586775768774, + "grad_norm": 0.006136585026979446, + "learning_rate": 0.2849159369676563, + "loss": 0.1308, + "num_input_tokens_seen": 5214384, + "step": 5760 + }, + { + "epoch": 1.5215784611323744, + "grad_norm": 0.004041978623718023, + "learning_rate": 0.2848901824824948, + "loss": 0.1353, + "num_input_tokens_seen": 5218928, + "step": 5765 + }, + { + "epoch": 1.5228982446878712, + "grad_norm": 0.002819858258590102, + "learning_rate": 0.284864407195572, + "loss": 0.1216, + "num_input_tokens_seen": 5223696, + "step": 5770 + }, + { + "epoch": 1.524218028243368, + "grad_norm": 0.0025203272234648466, + "learning_rate": 0.28483861111086284, + "loss": 0.1206, + "num_input_tokens_seen": 5228304, + "step": 5775 + }, + { + "epoch": 1.525537811798865, + "grad_norm": 0.00516548240557313, + "learning_rate": 0.2848127942323453, + "loss": 0.0981, + "num_input_tokens_seen": 5232912, + "step": 5780 + }, + { + "epoch": 1.526857595354362, + "grad_norm": 0.0038324277848005295, + "learning_rate": 0.2847869565640007, + "loss": 0.0891, + "num_input_tokens_seen": 5237360, + "step": 5785 + }, + { + "epoch": 1.528177378909859, + "grad_norm": 0.0050729019567370415, + "learning_rate": 0.2847610981098136, + "loss": 0.0932, + "num_input_tokens_seen": 5241872, + "step": 5790 + }, + { + "epoch": 1.5294971624653557, + "grad_norm": 0.0016377761494368315, + "learning_rate": 0.2847352188737716, + "loss": 0.0953, + "num_input_tokens_seen": 5246192, + "step": 5795 + }, + { + "epoch": 1.5308169460208525, + "grad_norm": 0.006088503170758486, + "learning_rate": 0.2847093188598658, + "loss": 0.1274, + "num_input_tokens_seen": 5250800, + "step": 5800 + }, + { + "epoch": 1.5308169460208525, + "eval_loss": 0.11869790405035019, + "eval_runtime": 75.7555, + "eval_samples_per_second": 88.904, + "eval_steps_per_second": 22.229, + "num_input_tokens_seen": 5250800, + "step": 5800 + }, + { + "epoch": 1.5321367295763495, + "grad_norm": 0.00802867952734232, + "learning_rate": 0.28468339807209003, + "loss": 0.1277, + "num_input_tokens_seen": 5255280, + "step": 5805 + }, + { + "epoch": 1.5334565131318465, + "grad_norm": 0.004883505403995514, + "learning_rate": 0.2846574565144418, + "loss": 0.121, + "num_input_tokens_seen": 5259984, + "step": 5810 + }, + { + "epoch": 1.5347762966873433, + "grad_norm": 0.007255645003169775, + "learning_rate": 0.28463149419092154, + "loss": 0.0936, + "num_input_tokens_seen": 5264944, + "step": 5815 + }, + { + "epoch": 1.53609608024284, + "grad_norm": 0.004478064831346273, + "learning_rate": 0.284605511105533, + "loss": 0.137, + "num_input_tokens_seen": 5269584, + "step": 5820 + }, + { + "epoch": 1.537415863798337, + "grad_norm": 0.0031889479141682386, + "learning_rate": 0.28457950726228315, + "loss": 0.0874, + "num_input_tokens_seen": 5274000, + "step": 5825 + }, + { + "epoch": 1.538735647353834, + "grad_norm": 0.0018409447511658072, + "learning_rate": 0.28455348266518193, + "loss": 0.0981, + "num_input_tokens_seen": 5278512, + "step": 5830 + }, + { + "epoch": 1.5400554309093308, + "grad_norm": 0.0032098200172185898, + "learning_rate": 0.28452743731824287, + "loss": 0.109, + "num_input_tokens_seen": 5282736, + "step": 5835 + }, + { + "epoch": 1.5413752144648276, + "grad_norm": 0.0033149162773042917, + "learning_rate": 0.28450137122548236, + "loss": 0.0821, + "num_input_tokens_seen": 5287024, + "step": 5840 + }, + { + "epoch": 1.5426949980203246, + "grad_norm": 0.004802994895726442, + "learning_rate": 0.2844752843909201, + "loss": 0.1105, + "num_input_tokens_seen": 5291760, + "step": 5845 + }, + { + "epoch": 1.5440147815758216, + "grad_norm": 0.00606977054849267, + "learning_rate": 0.28444917681857923, + "loss": 0.0984, + "num_input_tokens_seen": 5296048, + "step": 5850 + }, + { + "epoch": 1.5453345651313186, + "grad_norm": 0.00598618621006608, + "learning_rate": 0.28442304851248557, + "loss": 0.0762, + "num_input_tokens_seen": 5300528, + "step": 5855 + }, + { + "epoch": 1.5466543486868154, + "grad_norm": 0.0063447835855185986, + "learning_rate": 0.2843968994766686, + "loss": 0.1258, + "num_input_tokens_seen": 5305072, + "step": 5860 + }, + { + "epoch": 1.5479741322423122, + "grad_norm": 0.00672887684777379, + "learning_rate": 0.28437072971516075, + "loss": 0.1032, + "num_input_tokens_seen": 5309680, + "step": 5865 + }, + { + "epoch": 1.5492939157978092, + "grad_norm": 0.005954741034656763, + "learning_rate": 0.2843445392319979, + "loss": 0.1258, + "num_input_tokens_seen": 5313968, + "step": 5870 + }, + { + "epoch": 1.5506136993533062, + "grad_norm": 0.004004610702395439, + "learning_rate": 0.28431832803121865, + "loss": 0.0971, + "num_input_tokens_seen": 5318320, + "step": 5875 + }, + { + "epoch": 1.551933482908803, + "grad_norm": 0.005585811100900173, + "learning_rate": 0.28429209611686534, + "loss": 0.1276, + "num_input_tokens_seen": 5322544, + "step": 5880 + }, + { + "epoch": 1.5532532664642997, + "grad_norm": 0.002713495632633567, + "learning_rate": 0.28426584349298323, + "loss": 0.104, + "num_input_tokens_seen": 5326960, + "step": 5885 + }, + { + "epoch": 1.5545730500197967, + "grad_norm": 0.0035468756686896086, + "learning_rate": 0.2842395701636207, + "loss": 0.0727, + "num_input_tokens_seen": 5331760, + "step": 5890 + }, + { + "epoch": 1.5558928335752937, + "grad_norm": 0.004893211647868156, + "learning_rate": 0.28421327613282954, + "loss": 0.1023, + "num_input_tokens_seen": 5336368, + "step": 5895 + }, + { + "epoch": 1.5572126171307905, + "grad_norm": 0.0026944875717163086, + "learning_rate": 0.28418696140466454, + "loss": 0.0765, + "num_input_tokens_seen": 5341008, + "step": 5900 + }, + { + "epoch": 1.5585324006862873, + "grad_norm": 0.007490907795727253, + "learning_rate": 0.2841606259831838, + "loss": 0.1746, + "num_input_tokens_seen": 5345488, + "step": 5905 + }, + { + "epoch": 1.5598521842417843, + "grad_norm": 0.009255079552531242, + "learning_rate": 0.2841342698724486, + "loss": 0.1308, + "num_input_tokens_seen": 5349936, + "step": 5910 + }, + { + "epoch": 1.5611719677972813, + "grad_norm": 0.0037901601754128933, + "learning_rate": 0.28410789307652334, + "loss": 0.1632, + "num_input_tokens_seen": 5354384, + "step": 5915 + }, + { + "epoch": 1.5624917513527783, + "grad_norm": 0.003471923992037773, + "learning_rate": 0.2840814955994756, + "loss": 0.1088, + "num_input_tokens_seen": 5358640, + "step": 5920 + }, + { + "epoch": 1.563811534908275, + "grad_norm": 0.0025094517040997744, + "learning_rate": 0.2840550774453763, + "loss": 0.0926, + "num_input_tokens_seen": 5363248, + "step": 5925 + }, + { + "epoch": 1.5651313184637718, + "grad_norm": 0.006322829518467188, + "learning_rate": 0.28402863861829947, + "loss": 0.1142, + "num_input_tokens_seen": 5367632, + "step": 5930 + }, + { + "epoch": 1.5664511020192688, + "grad_norm": 0.007687756791710854, + "learning_rate": 0.2840021791223222, + "loss": 0.1019, + "num_input_tokens_seen": 5372336, + "step": 5935 + }, + { + "epoch": 1.5677708855747658, + "grad_norm": 0.001480769133195281, + "learning_rate": 0.2839756989615249, + "loss": 0.0547, + "num_input_tokens_seen": 5376656, + "step": 5940 + }, + { + "epoch": 1.5690906691302626, + "grad_norm": 0.00530625693500042, + "learning_rate": 0.28394919813999125, + "loss": 0.1021, + "num_input_tokens_seen": 5381392, + "step": 5945 + }, + { + "epoch": 1.5704104526857594, + "grad_norm": 0.006609358359128237, + "learning_rate": 0.28392267666180787, + "loss": 0.0614, + "num_input_tokens_seen": 5386096, + "step": 5950 + }, + { + "epoch": 1.5717302362412564, + "grad_norm": 0.009153465740382671, + "learning_rate": 0.2838961345310648, + "loss": 0.1323, + "num_input_tokens_seen": 5390384, + "step": 5955 + }, + { + "epoch": 1.5730500197967534, + "grad_norm": 0.0035736998543143272, + "learning_rate": 0.2838695717518552, + "loss": 0.1165, + "num_input_tokens_seen": 5394896, + "step": 5960 + }, + { + "epoch": 1.5743698033522502, + "grad_norm": 0.009243722073733807, + "learning_rate": 0.28384298832827526, + "loss": 0.1043, + "num_input_tokens_seen": 5399632, + "step": 5965 + }, + { + "epoch": 1.575689586907747, + "grad_norm": 0.0030154252890497446, + "learning_rate": 0.28381638426442457, + "loss": 0.1565, + "num_input_tokens_seen": 5404176, + "step": 5970 + }, + { + "epoch": 1.577009370463244, + "grad_norm": 0.0034737777896225452, + "learning_rate": 0.2837897595644057, + "loss": 0.1196, + "num_input_tokens_seen": 5408976, + "step": 5975 + }, + { + "epoch": 1.578329154018741, + "grad_norm": 0.006657496094703674, + "learning_rate": 0.28376311423232475, + "loss": 0.1218, + "num_input_tokens_seen": 5413456, + "step": 5980 + }, + { + "epoch": 1.579648937574238, + "grad_norm": 0.003851131536066532, + "learning_rate": 0.2837364482722905, + "loss": 0.1291, + "num_input_tokens_seen": 5418032, + "step": 5985 + }, + { + "epoch": 1.5809687211297347, + "grad_norm": 0.00432856660336256, + "learning_rate": 0.28370976168841533, + "loss": 0.1171, + "num_input_tokens_seen": 5422512, + "step": 5990 + }, + { + "epoch": 1.5822885046852315, + "grad_norm": 0.005541198421269655, + "learning_rate": 0.2836830544848146, + "loss": 0.089, + "num_input_tokens_seen": 5427216, + "step": 5995 + }, + { + "epoch": 1.5836082882407285, + "grad_norm": 0.005022334400564432, + "learning_rate": 0.2836563266656069, + "loss": 0.1038, + "num_input_tokens_seen": 5431440, + "step": 6000 + }, + { + "epoch": 1.5836082882407285, + "eval_loss": 0.1196097880601883, + "eval_runtime": 76.0092, + "eval_samples_per_second": 88.608, + "eval_steps_per_second": 22.155, + "num_input_tokens_seen": 5431440, + "step": 6000 + }, + { + "epoch": 1.5849280717962255, + "grad_norm": 0.003944274503737688, + "learning_rate": 0.283629578234914, + "loss": 0.128, + "num_input_tokens_seen": 5435728, + "step": 6005 + }, + { + "epoch": 1.5862478553517223, + "grad_norm": 0.004610151518136263, + "learning_rate": 0.2836028091968608, + "loss": 0.1077, + "num_input_tokens_seen": 5440368, + "step": 6010 + }, + { + "epoch": 1.587567638907219, + "grad_norm": 0.006802560295909643, + "learning_rate": 0.28357601955557554, + "loss": 0.1283, + "num_input_tokens_seen": 5444720, + "step": 6015 + }, + { + "epoch": 1.588887422462716, + "grad_norm": 0.004576560575515032, + "learning_rate": 0.2835492093151894, + "loss": 0.1497, + "num_input_tokens_seen": 5449168, + "step": 6020 + }, + { + "epoch": 1.590207206018213, + "grad_norm": 0.0040879929438233376, + "learning_rate": 0.2835223784798369, + "loss": 0.1211, + "num_input_tokens_seen": 5453616, + "step": 6025 + }, + { + "epoch": 1.5915269895737099, + "grad_norm": 0.005435646511614323, + "learning_rate": 0.2834955270536557, + "loss": 0.1438, + "num_input_tokens_seen": 5457968, + "step": 6030 + }, + { + "epoch": 1.5928467731292069, + "grad_norm": 0.004297749139368534, + "learning_rate": 0.2834686550407866, + "loss": 0.1656, + "num_input_tokens_seen": 5462768, + "step": 6035 + }, + { + "epoch": 1.5941665566847036, + "grad_norm": 0.004091786686331034, + "learning_rate": 0.28344176244537367, + "loss": 0.1393, + "num_input_tokens_seen": 5467248, + "step": 6040 + }, + { + "epoch": 1.5954863402402006, + "grad_norm": 0.003518487326800823, + "learning_rate": 0.28341484927156396, + "loss": 0.1265, + "num_input_tokens_seen": 5471792, + "step": 6045 + }, + { + "epoch": 1.5968061237956976, + "grad_norm": 0.005593572277575731, + "learning_rate": 0.28338791552350795, + "loss": 0.148, + "num_input_tokens_seen": 5476368, + "step": 6050 + }, + { + "epoch": 1.5981259073511944, + "grad_norm": 0.005490721669048071, + "learning_rate": 0.28336096120535914, + "loss": 0.166, + "num_input_tokens_seen": 5480848, + "step": 6055 + }, + { + "epoch": 1.5994456909066912, + "grad_norm": 0.006307526491582394, + "learning_rate": 0.2833339863212741, + "loss": 0.1122, + "num_input_tokens_seen": 5485488, + "step": 6060 + }, + { + "epoch": 1.6007654744621882, + "grad_norm": 0.004959978628903627, + "learning_rate": 0.28330699087541283, + "loss": 0.0903, + "num_input_tokens_seen": 5489968, + "step": 6065 + }, + { + "epoch": 1.6020852580176852, + "grad_norm": 0.00552026554942131, + "learning_rate": 0.2832799748719384, + "loss": 0.1616, + "num_input_tokens_seen": 5494608, + "step": 6070 + }, + { + "epoch": 1.603405041573182, + "grad_norm": 0.006449808832257986, + "learning_rate": 0.28325293831501686, + "loss": 0.1312, + "num_input_tokens_seen": 5499216, + "step": 6075 + }, + { + "epoch": 1.6047248251286788, + "grad_norm": 0.0033152394462376833, + "learning_rate": 0.2832258812088177, + "loss": 0.0962, + "num_input_tokens_seen": 5503600, + "step": 6080 + }, + { + "epoch": 1.6060446086841758, + "grad_norm": 0.004435498267412186, + "learning_rate": 0.2831988035575134, + "loss": 0.0937, + "num_input_tokens_seen": 5507824, + "step": 6085 + }, + { + "epoch": 1.6073643922396728, + "grad_norm": 0.008165810257196426, + "learning_rate": 0.28317170536527975, + "loss": 0.1277, + "num_input_tokens_seen": 5512208, + "step": 6090 + }, + { + "epoch": 1.6086841757951698, + "grad_norm": 0.003959451336413622, + "learning_rate": 0.2831445866362956, + "loss": 0.0927, + "num_input_tokens_seen": 5516720, + "step": 6095 + }, + { + "epoch": 1.6100039593506665, + "grad_norm": 0.004431739915162325, + "learning_rate": 0.2831174473747429, + "loss": 0.1275, + "num_input_tokens_seen": 5521328, + "step": 6100 + }, + { + "epoch": 1.6113237429061633, + "grad_norm": 0.007197056896984577, + "learning_rate": 0.2830902875848071, + "loss": 0.1154, + "num_input_tokens_seen": 5525712, + "step": 6105 + }, + { + "epoch": 1.6126435264616603, + "grad_norm": 0.0038966448046267033, + "learning_rate": 0.28306310727067635, + "loss": 0.0753, + "num_input_tokens_seen": 5530608, + "step": 6110 + }, + { + "epoch": 1.6139633100171573, + "grad_norm": 0.006587582640349865, + "learning_rate": 0.2830359064365423, + "loss": 0.1102, + "num_input_tokens_seen": 5534896, + "step": 6115 + }, + { + "epoch": 1.615283093572654, + "grad_norm": 0.004991055000573397, + "learning_rate": 0.28300868508659965, + "loss": 0.1326, + "num_input_tokens_seen": 5539408, + "step": 6120 + }, + { + "epoch": 1.6166028771281509, + "grad_norm": 0.005191831849515438, + "learning_rate": 0.28298144322504626, + "loss": 0.1153, + "num_input_tokens_seen": 5543568, + "step": 6125 + }, + { + "epoch": 1.6179226606836479, + "grad_norm": 0.004845436196774244, + "learning_rate": 0.2829541808560832, + "loss": 0.1471, + "num_input_tokens_seen": 5548240, + "step": 6130 + }, + { + "epoch": 1.6192424442391449, + "grad_norm": 0.00598903326317668, + "learning_rate": 0.2829268979839146, + "loss": 0.1105, + "num_input_tokens_seen": 5552464, + "step": 6135 + }, + { + "epoch": 1.6205622277946417, + "grad_norm": 0.002018817001953721, + "learning_rate": 0.2828995946127479, + "loss": 0.084, + "num_input_tokens_seen": 5557264, + "step": 6140 + }, + { + "epoch": 1.6218820113501384, + "grad_norm": 0.005883863661438227, + "learning_rate": 0.2828722707467936, + "loss": 0.1361, + "num_input_tokens_seen": 5561648, + "step": 6145 + }, + { + "epoch": 1.6232017949056354, + "grad_norm": 0.003041384508833289, + "learning_rate": 0.2828449263902653, + "loss": 0.1107, + "num_input_tokens_seen": 5566128, + "step": 6150 + }, + { + "epoch": 1.6245215784611324, + "grad_norm": 0.004267403390258551, + "learning_rate": 0.28281756154738, + "loss": 0.1348, + "num_input_tokens_seen": 5570832, + "step": 6155 + }, + { + "epoch": 1.6258413620166294, + "grad_norm": 0.0031499939505010843, + "learning_rate": 0.28279017622235764, + "loss": 0.1231, + "num_input_tokens_seen": 5575344, + "step": 6160 + }, + { + "epoch": 1.6271611455721262, + "grad_norm": 0.003335430985316634, + "learning_rate": 0.28276277041942127, + "loss": 0.1213, + "num_input_tokens_seen": 5579696, + "step": 6165 + }, + { + "epoch": 1.628480929127623, + "grad_norm": 0.002339520724490285, + "learning_rate": 0.2827353441427974, + "loss": 0.1408, + "num_input_tokens_seen": 5584272, + "step": 6170 + }, + { + "epoch": 1.62980071268312, + "grad_norm": 0.003467801958322525, + "learning_rate": 0.2827078973967153, + "loss": 0.118, + "num_input_tokens_seen": 5588912, + "step": 6175 + }, + { + "epoch": 1.631120496238617, + "grad_norm": 0.004038172774016857, + "learning_rate": 0.2826804301854078, + "loss": 0.1235, + "num_input_tokens_seen": 5593584, + "step": 6180 + }, + { + "epoch": 1.6324402797941138, + "grad_norm": 0.004467231687158346, + "learning_rate": 0.2826529425131105, + "loss": 0.1075, + "num_input_tokens_seen": 5598032, + "step": 6185 + }, + { + "epoch": 1.6337600633496105, + "grad_norm": 0.005421272478997707, + "learning_rate": 0.2826254343840625, + "loss": 0.1379, + "num_input_tokens_seen": 5602288, + "step": 6190 + }, + { + "epoch": 1.6350798469051075, + "grad_norm": 0.004284044727683067, + "learning_rate": 0.2825979058025059, + "loss": 0.1083, + "num_input_tokens_seen": 5606768, + "step": 6195 + }, + { + "epoch": 1.6363996304606045, + "grad_norm": 0.006837638560682535, + "learning_rate": 0.2825703567726858, + "loss": 0.1364, + "num_input_tokens_seen": 5611184, + "step": 6200 + }, + { + "epoch": 1.6363996304606045, + "eval_loss": 0.1235659271478653, + "eval_runtime": 75.9107, + "eval_samples_per_second": 88.723, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 5611184, + "step": 6200 + }, + { + "epoch": 1.6377194140161013, + "grad_norm": 0.0045074704103171825, + "learning_rate": 0.2825427872988508, + "loss": 0.1337, + "num_input_tokens_seen": 5615696, + "step": 6205 + }, + { + "epoch": 1.639039197571598, + "grad_norm": 0.002658196957781911, + "learning_rate": 0.28251519738525227, + "loss": 0.1028, + "num_input_tokens_seen": 5620208, + "step": 6210 + }, + { + "epoch": 1.640358981127095, + "grad_norm": 0.003643004223704338, + "learning_rate": 0.28248758703614507, + "loss": 0.1051, + "num_input_tokens_seen": 5624720, + "step": 6215 + }, + { + "epoch": 1.641678764682592, + "grad_norm": 0.004092008341103792, + "learning_rate": 0.28245995625578696, + "loss": 0.1516, + "num_input_tokens_seen": 5629008, + "step": 6220 + }, + { + "epoch": 1.642998548238089, + "grad_norm": 0.004655254539102316, + "learning_rate": 0.282432305048439, + "loss": 0.1071, + "num_input_tokens_seen": 5633296, + "step": 6225 + }, + { + "epoch": 1.6443183317935859, + "grad_norm": 0.005790372379124165, + "learning_rate": 0.28240463341836536, + "loss": 0.1454, + "num_input_tokens_seen": 5638000, + "step": 6230 + }, + { + "epoch": 1.6456381153490827, + "grad_norm": 0.0024641109630465508, + "learning_rate": 0.2823769413698334, + "loss": 0.1156, + "num_input_tokens_seen": 5642288, + "step": 6235 + }, + { + "epoch": 1.6469578989045797, + "grad_norm": 0.0035246326588094234, + "learning_rate": 0.2823492289071135, + "loss": 0.1072, + "num_input_tokens_seen": 5646768, + "step": 6240 + }, + { + "epoch": 1.6482776824600767, + "grad_norm": 0.00351124769076705, + "learning_rate": 0.2823214960344793, + "loss": 0.1441, + "num_input_tokens_seen": 5651536, + "step": 6245 + }, + { + "epoch": 1.6495974660155734, + "grad_norm": 0.007719897199422121, + "learning_rate": 0.28229374275620756, + "loss": 0.1784, + "num_input_tokens_seen": 5656112, + "step": 6250 + }, + { + "epoch": 1.6509172495710702, + "grad_norm": 0.004176015499979258, + "learning_rate": 0.28226596907657814, + "loss": 0.1397, + "num_input_tokens_seen": 5660784, + "step": 6255 + }, + { + "epoch": 1.6522370331265672, + "grad_norm": 0.0053043910302221775, + "learning_rate": 0.28223817499987414, + "loss": 0.1713, + "num_input_tokens_seen": 5665456, + "step": 6260 + }, + { + "epoch": 1.6535568166820642, + "grad_norm": 0.005445965565741062, + "learning_rate": 0.2822103605303818, + "loss": 0.1148, + "num_input_tokens_seen": 5670128, + "step": 6265 + }, + { + "epoch": 1.654876600237561, + "grad_norm": 0.0027238959446549416, + "learning_rate": 0.2821825256723903, + "loss": 0.1133, + "num_input_tokens_seen": 5674768, + "step": 6270 + }, + { + "epoch": 1.6561963837930578, + "grad_norm": 0.005506027955561876, + "learning_rate": 0.2821546704301923, + "loss": 0.1171, + "num_input_tokens_seen": 5679376, + "step": 6275 + }, + { + "epoch": 1.6575161673485548, + "grad_norm": 0.0017371505964547396, + "learning_rate": 0.2821267948080834, + "loss": 0.0959, + "num_input_tokens_seen": 5684048, + "step": 6280 + }, + { + "epoch": 1.6588359509040518, + "grad_norm": 0.005831413436681032, + "learning_rate": 0.28209889881036226, + "loss": 0.1059, + "num_input_tokens_seen": 5688432, + "step": 6285 + }, + { + "epoch": 1.6601557344595488, + "grad_norm": 0.009090334177017212, + "learning_rate": 0.28207098244133094, + "loss": 0.1474, + "num_input_tokens_seen": 5693264, + "step": 6290 + }, + { + "epoch": 1.6614755180150456, + "grad_norm": 0.004045999143272638, + "learning_rate": 0.2820430457052943, + "loss": 0.1016, + "num_input_tokens_seen": 5697840, + "step": 6295 + }, + { + "epoch": 1.6627953015705423, + "grad_norm": 0.004818873014301062, + "learning_rate": 0.28201508860656077, + "loss": 0.1046, + "num_input_tokens_seen": 5702320, + "step": 6300 + }, + { + "epoch": 1.6641150851260393, + "grad_norm": 0.005859986878931522, + "learning_rate": 0.2819871111494415, + "loss": 0.1545, + "num_input_tokens_seen": 5706960, + "step": 6305 + }, + { + "epoch": 1.6654348686815363, + "grad_norm": 0.005431707948446274, + "learning_rate": 0.28195911333825113, + "loss": 0.2193, + "num_input_tokens_seen": 5711536, + "step": 6310 + }, + { + "epoch": 1.6667546522370331, + "grad_norm": 0.005583788268268108, + "learning_rate": 0.28193109517730713, + "loss": 0.135, + "num_input_tokens_seen": 5716080, + "step": 6315 + }, + { + "epoch": 1.66807443579253, + "grad_norm": 0.004737510345876217, + "learning_rate": 0.2819030566709303, + "loss": 0.1995, + "num_input_tokens_seen": 5720624, + "step": 6320 + }, + { + "epoch": 1.669394219348027, + "grad_norm": 0.0036079350393265486, + "learning_rate": 0.2818749978234445, + "loss": 0.1461, + "num_input_tokens_seen": 5725232, + "step": 6325 + }, + { + "epoch": 1.670714002903524, + "grad_norm": 0.002344125183299184, + "learning_rate": 0.2818469186391768, + "loss": 0.1369, + "num_input_tokens_seen": 5729840, + "step": 6330 + }, + { + "epoch": 1.6720337864590207, + "grad_norm": 0.003117927582934499, + "learning_rate": 0.28181881912245743, + "loss": 0.1101, + "num_input_tokens_seen": 5734288, + "step": 6335 + }, + { + "epoch": 1.6733535700145175, + "grad_norm": 0.0049718935042619705, + "learning_rate": 0.2817906992776195, + "loss": 0.1326, + "num_input_tokens_seen": 5738864, + "step": 6340 + }, + { + "epoch": 1.6746733535700145, + "grad_norm": 0.004550440702587366, + "learning_rate": 0.28176255910899967, + "loss": 0.1321, + "num_input_tokens_seen": 5743216, + "step": 6345 + }, + { + "epoch": 1.6759931371255115, + "grad_norm": 0.0058221397921442986, + "learning_rate": 0.2817343986209373, + "loss": 0.124, + "num_input_tokens_seen": 5747568, + "step": 6350 + }, + { + "epoch": 1.6773129206810085, + "grad_norm": 0.002609601942822337, + "learning_rate": 0.2817062178177753, + "loss": 0.1047, + "num_input_tokens_seen": 5752048, + "step": 6355 + }, + { + "epoch": 1.6786327042365052, + "grad_norm": 0.005732995457947254, + "learning_rate": 0.2816780167038593, + "loss": 0.1162, + "num_input_tokens_seen": 5756432, + "step": 6360 + }, + { + "epoch": 1.679952487792002, + "grad_norm": 0.005148160737007856, + "learning_rate": 0.28164979528353834, + "loss": 0.1009, + "num_input_tokens_seen": 5761072, + "step": 6365 + }, + { + "epoch": 1.681272271347499, + "grad_norm": 0.005397184751927853, + "learning_rate": 0.28162155356116453, + "loss": 0.1153, + "num_input_tokens_seen": 5765264, + "step": 6370 + }, + { + "epoch": 1.682592054902996, + "grad_norm": 0.003218236146494746, + "learning_rate": 0.28159329154109314, + "loss": 0.1322, + "num_input_tokens_seen": 5769808, + "step": 6375 + }, + { + "epoch": 1.6839118384584928, + "grad_norm": 0.0032866550609469414, + "learning_rate": 0.28156500922768246, + "loss": 0.0925, + "num_input_tokens_seen": 5774128, + "step": 6380 + }, + { + "epoch": 1.6852316220139896, + "grad_norm": 0.0021595750004053116, + "learning_rate": 0.28153670662529406, + "loss": 0.1318, + "num_input_tokens_seen": 5778512, + "step": 6385 + }, + { + "epoch": 1.6865514055694866, + "grad_norm": 0.004050437361001968, + "learning_rate": 0.28150838373829246, + "loss": 0.1145, + "num_input_tokens_seen": 5783056, + "step": 6390 + }, + { + "epoch": 1.6878711891249836, + "grad_norm": 0.004262225236743689, + "learning_rate": 0.2814800405710455, + "loss": 0.1209, + "num_input_tokens_seen": 5787600, + "step": 6395 + }, + { + "epoch": 1.6891909726804804, + "grad_norm": 0.005345833953469992, + "learning_rate": 0.2814516771279239, + "loss": 0.132, + "num_input_tokens_seen": 5792144, + "step": 6400 + }, + { + "epoch": 1.6891909726804804, + "eval_loss": 0.1156749427318573, + "eval_runtime": 75.8697, + "eval_samples_per_second": 88.771, + "eval_steps_per_second": 22.196, + "num_input_tokens_seen": 5792144, + "step": 6400 + }, + { + "epoch": 1.6905107562359774, + "grad_norm": 0.003240998601540923, + "learning_rate": 0.28142329341330186, + "loss": 0.1436, + "num_input_tokens_seen": 5796880, + "step": 6405 + }, + { + "epoch": 1.6918305397914741, + "grad_norm": 0.004001044202595949, + "learning_rate": 0.2813948894315564, + "loss": 0.0865, + "num_input_tokens_seen": 5801776, + "step": 6410 + }, + { + "epoch": 1.6931503233469711, + "grad_norm": 0.005806570872664452, + "learning_rate": 0.2813664651870677, + "loss": 0.1197, + "num_input_tokens_seen": 5806384, + "step": 6415 + }, + { + "epoch": 1.6944701069024681, + "grad_norm": 0.0033461342100054026, + "learning_rate": 0.28133802068421926, + "loss": 0.1425, + "num_input_tokens_seen": 5810576, + "step": 6420 + }, + { + "epoch": 1.695789890457965, + "grad_norm": 0.00447927275672555, + "learning_rate": 0.28130955592739754, + "loss": 0.1489, + "num_input_tokens_seen": 5815472, + "step": 6425 + }, + { + "epoch": 1.6971096740134617, + "grad_norm": 0.00342123256996274, + "learning_rate": 0.2812810709209922, + "loss": 0.1155, + "num_input_tokens_seen": 5819856, + "step": 6430 + }, + { + "epoch": 1.6984294575689587, + "grad_norm": 0.0033614984713494778, + "learning_rate": 0.2812525656693959, + "loss": 0.1155, + "num_input_tokens_seen": 5824272, + "step": 6435 + }, + { + "epoch": 1.6997492411244557, + "grad_norm": 0.002527072560042143, + "learning_rate": 0.28122404017700453, + "loss": 0.1294, + "num_input_tokens_seen": 5828816, + "step": 6440 + }, + { + "epoch": 1.7010690246799525, + "grad_norm": 0.002676017815247178, + "learning_rate": 0.2811954944482171, + "loss": 0.0929, + "num_input_tokens_seen": 5833232, + "step": 6445 + }, + { + "epoch": 1.7023888082354492, + "grad_norm": 0.003716310951858759, + "learning_rate": 0.2811669284874358, + "loss": 0.1132, + "num_input_tokens_seen": 5837680, + "step": 6450 + }, + { + "epoch": 1.7037085917909462, + "grad_norm": 0.00378919905051589, + "learning_rate": 0.2811383422990657, + "loss": 0.1339, + "num_input_tokens_seen": 5842192, + "step": 6455 + }, + { + "epoch": 1.7050283753464432, + "grad_norm": 0.004164665937423706, + "learning_rate": 0.2811097358875152, + "loss": 0.1395, + "num_input_tokens_seen": 5846576, + "step": 6460 + }, + { + "epoch": 1.70634815890194, + "grad_norm": 0.003884067991748452, + "learning_rate": 0.2810811092571959, + "loss": 0.1006, + "num_input_tokens_seen": 5851312, + "step": 6465 + }, + { + "epoch": 1.707667942457437, + "grad_norm": 0.0066137188114225864, + "learning_rate": 0.28105246241252224, + "loss": 0.1928, + "num_input_tokens_seen": 5856080, + "step": 6470 + }, + { + "epoch": 1.7089877260129338, + "grad_norm": 0.004035115707665682, + "learning_rate": 0.28102379535791194, + "loss": 0.1442, + "num_input_tokens_seen": 5860624, + "step": 6475 + }, + { + "epoch": 1.7103075095684308, + "grad_norm": 0.0018635854357853532, + "learning_rate": 0.2809951080977859, + "loss": 0.0916, + "num_input_tokens_seen": 5864880, + "step": 6480 + }, + { + "epoch": 1.7116272931239278, + "grad_norm": 0.003371815662831068, + "learning_rate": 0.28096640063656797, + "loss": 0.0998, + "num_input_tokens_seen": 5869168, + "step": 6485 + }, + { + "epoch": 1.7129470766794246, + "grad_norm": 0.003648288082331419, + "learning_rate": 0.2809376729786852, + "loss": 0.1368, + "num_input_tokens_seen": 5873712, + "step": 6490 + }, + { + "epoch": 1.7142668602349214, + "grad_norm": 0.003216848010197282, + "learning_rate": 0.28090892512856785, + "loss": 0.1061, + "num_input_tokens_seen": 5878256, + "step": 6495 + }, + { + "epoch": 1.7155866437904184, + "grad_norm": 0.0023968880996108055, + "learning_rate": 0.2808801570906491, + "loss": 0.109, + "num_input_tokens_seen": 5882768, + "step": 6500 + }, + { + "epoch": 1.7169064273459154, + "grad_norm": 0.004336456302553415, + "learning_rate": 0.2808513688693654, + "loss": 0.1386, + "num_input_tokens_seen": 5887504, + "step": 6505 + }, + { + "epoch": 1.7182262109014121, + "grad_norm": 0.005888712126761675, + "learning_rate": 0.28082256046915627, + "loss": 0.1403, + "num_input_tokens_seen": 5892304, + "step": 6510 + }, + { + "epoch": 1.719545994456909, + "grad_norm": 0.006341854576021433, + "learning_rate": 0.28079373189446427, + "loss": 0.0941, + "num_input_tokens_seen": 5896656, + "step": 6515 + }, + { + "epoch": 1.720865778012406, + "grad_norm": 0.004961774684488773, + "learning_rate": 0.28076488314973513, + "loss": 0.1222, + "num_input_tokens_seen": 5901360, + "step": 6520 + }, + { + "epoch": 1.722185561567903, + "grad_norm": 0.005088921170681715, + "learning_rate": 0.28073601423941774, + "loss": 0.1203, + "num_input_tokens_seen": 5905648, + "step": 6525 + }, + { + "epoch": 1.7235053451234, + "grad_norm": 0.004107099492102861, + "learning_rate": 0.28070712516796403, + "loss": 0.1445, + "num_input_tokens_seen": 5910128, + "step": 6530 + }, + { + "epoch": 1.7248251286788967, + "grad_norm": 0.0030517037957906723, + "learning_rate": 0.28067821593982906, + "loss": 0.1276, + "num_input_tokens_seen": 5914640, + "step": 6535 + }, + { + "epoch": 1.7261449122343935, + "grad_norm": 0.0028372204396873713, + "learning_rate": 0.28064928655947097, + "loss": 0.0818, + "num_input_tokens_seen": 5919408, + "step": 6540 + }, + { + "epoch": 1.7274646957898905, + "grad_norm": 0.008413886651396751, + "learning_rate": 0.28062033703135103, + "loss": 0.184, + "num_input_tokens_seen": 5923728, + "step": 6545 + }, + { + "epoch": 1.7287844793453875, + "grad_norm": 0.004774109926074743, + "learning_rate": 0.2805913673599337, + "loss": 0.1099, + "num_input_tokens_seen": 5928080, + "step": 6550 + }, + { + "epoch": 1.7301042629008843, + "grad_norm": 0.002425720915198326, + "learning_rate": 0.2805623775496864, + "loss": 0.1145, + "num_input_tokens_seen": 5932720, + "step": 6555 + }, + { + "epoch": 1.731424046456381, + "grad_norm": 0.0027559977024793625, + "learning_rate": 0.2805333676050797, + "loss": 0.0904, + "num_input_tokens_seen": 5937392, + "step": 6560 + }, + { + "epoch": 1.732743830011878, + "grad_norm": 0.005258992779999971, + "learning_rate": 0.2805043375305873, + "loss": 0.1117, + "num_input_tokens_seen": 5941904, + "step": 6565 + }, + { + "epoch": 1.734063613567375, + "grad_norm": 0.004337307531386614, + "learning_rate": 0.2804752873306861, + "loss": 0.0998, + "num_input_tokens_seen": 5946352, + "step": 6570 + }, + { + "epoch": 1.7353833971228718, + "grad_norm": 0.006350680720061064, + "learning_rate": 0.2804462170098559, + "loss": 0.1071, + "num_input_tokens_seen": 5950896, + "step": 6575 + }, + { + "epoch": 1.7367031806783686, + "grad_norm": 0.0064525846391916275, + "learning_rate": 0.2804171265725797, + "loss": 0.1515, + "num_input_tokens_seen": 5955632, + "step": 6580 + }, + { + "epoch": 1.7380229642338656, + "grad_norm": 0.004220061004161835, + "learning_rate": 0.28038801602334373, + "loss": 0.1253, + "num_input_tokens_seen": 5959984, + "step": 6585 + }, + { + "epoch": 1.7393427477893626, + "grad_norm": 0.006610743701457977, + "learning_rate": 0.28035888536663717, + "loss": 0.1316, + "num_input_tokens_seen": 5964400, + "step": 6590 + }, + { + "epoch": 1.7406625313448596, + "grad_norm": 0.004140675067901611, + "learning_rate": 0.2803297346069522, + "loss": 0.1088, + "num_input_tokens_seen": 5969328, + "step": 6595 + }, + { + "epoch": 1.7419823149003564, + "grad_norm": 0.005148195195943117, + "learning_rate": 0.28030056374878437, + "loss": 0.1404, + "num_input_tokens_seen": 5974000, + "step": 6600 + }, + { + "epoch": 1.7419823149003564, + "eval_loss": 0.11491036415100098, + "eval_runtime": 75.9465, + "eval_samples_per_second": 88.681, + "eval_steps_per_second": 22.174, + "num_input_tokens_seen": 5974000, + "step": 6600 + }, + { + "epoch": 1.7433020984558532, + "grad_norm": 0.004873611498624086, + "learning_rate": 0.2802713727966321, + "loss": 0.097, + "num_input_tokens_seen": 5978672, + "step": 6605 + }, + { + "epoch": 1.7446218820113502, + "grad_norm": 0.002372570801526308, + "learning_rate": 0.28024216175499717, + "loss": 0.0903, + "num_input_tokens_seen": 5983216, + "step": 6610 + }, + { + "epoch": 1.7459416655668472, + "grad_norm": 0.002300061285495758, + "learning_rate": 0.2802129306283841, + "loss": 0.1272, + "num_input_tokens_seen": 5987728, + "step": 6615 + }, + { + "epoch": 1.747261449122344, + "grad_norm": 0.0038662836886942387, + "learning_rate": 0.28018367942130074, + "loss": 0.1017, + "num_input_tokens_seen": 5992048, + "step": 6620 + }, + { + "epoch": 1.7485812326778407, + "grad_norm": 0.005044789984822273, + "learning_rate": 0.28015440813825804, + "loss": 0.1663, + "num_input_tokens_seen": 5997040, + "step": 6625 + }, + { + "epoch": 1.7499010162333377, + "grad_norm": 0.002926019486039877, + "learning_rate": 0.28012511678377006, + "loss": 0.1206, + "num_input_tokens_seen": 6001488, + "step": 6630 + }, + { + "epoch": 1.7512207997888347, + "grad_norm": 0.0030716152396053076, + "learning_rate": 0.28009580536235373, + "loss": 0.1251, + "num_input_tokens_seen": 6005872, + "step": 6635 + }, + { + "epoch": 1.7525405833443315, + "grad_norm": 0.003187052207067609, + "learning_rate": 0.28006647387852934, + "loss": 0.1291, + "num_input_tokens_seen": 6010576, + "step": 6640 + }, + { + "epoch": 1.7538603668998283, + "grad_norm": 0.002134262816980481, + "learning_rate": 0.28003712233682015, + "loss": 0.0905, + "num_input_tokens_seen": 6015472, + "step": 6645 + }, + { + "epoch": 1.7551801504553253, + "grad_norm": 0.007441185414791107, + "learning_rate": 0.2800077507417526, + "loss": 0.1292, + "num_input_tokens_seen": 6020272, + "step": 6650 + }, + { + "epoch": 1.7564999340108223, + "grad_norm": 0.0033294139429926872, + "learning_rate": 0.2799783590978561, + "loss": 0.1159, + "num_input_tokens_seen": 6024496, + "step": 6655 + }, + { + "epoch": 1.7578197175663193, + "grad_norm": 0.003509930334985256, + "learning_rate": 0.2799489474096632, + "loss": 0.1009, + "num_input_tokens_seen": 6028976, + "step": 6660 + }, + { + "epoch": 1.759139501121816, + "grad_norm": 0.002848699688911438, + "learning_rate": 0.27991951568170953, + "loss": 0.1037, + "num_input_tokens_seen": 6033360, + "step": 6665 + }, + { + "epoch": 1.7604592846773128, + "grad_norm": 0.006154004018753767, + "learning_rate": 0.2798900639185339, + "loss": 0.1348, + "num_input_tokens_seen": 6038096, + "step": 6670 + }, + { + "epoch": 1.7617790682328098, + "grad_norm": 0.003978870343416929, + "learning_rate": 0.2798605921246781, + "loss": 0.1069, + "num_input_tokens_seen": 6042128, + "step": 6675 + }, + { + "epoch": 1.7630988517883068, + "grad_norm": 0.002532073063775897, + "learning_rate": 0.2798311003046871, + "loss": 0.1163, + "num_input_tokens_seen": 6046576, + "step": 6680 + }, + { + "epoch": 1.7644186353438036, + "grad_norm": 0.005081879440695047, + "learning_rate": 0.2798015884631089, + "loss": 0.1343, + "num_input_tokens_seen": 6050800, + "step": 6685 + }, + { + "epoch": 1.7657384188993004, + "grad_norm": 0.005232254043221474, + "learning_rate": 0.27977205660449445, + "loss": 0.1132, + "num_input_tokens_seen": 6055408, + "step": 6690 + }, + { + "epoch": 1.7670582024547974, + "grad_norm": 0.003956780768930912, + "learning_rate": 0.2797425047333981, + "loss": 0.0888, + "num_input_tokens_seen": 6059696, + "step": 6695 + }, + { + "epoch": 1.7683779860102944, + "grad_norm": 0.0029293729458004236, + "learning_rate": 0.27971293285437715, + "loss": 0.0896, + "num_input_tokens_seen": 6064144, + "step": 6700 + }, + { + "epoch": 1.7696977695657912, + "grad_norm": 0.004405713640153408, + "learning_rate": 0.2796833409719918, + "loss": 0.1058, + "num_input_tokens_seen": 6068752, + "step": 6705 + }, + { + "epoch": 1.771017553121288, + "grad_norm": 0.0044570136815309525, + "learning_rate": 0.27965372909080566, + "loss": 0.1512, + "num_input_tokens_seen": 6073488, + "step": 6710 + }, + { + "epoch": 1.772337336676785, + "grad_norm": 0.0034688732121139765, + "learning_rate": 0.27962409721538506, + "loss": 0.0489, + "num_input_tokens_seen": 6077520, + "step": 6715 + }, + { + "epoch": 1.773657120232282, + "grad_norm": 0.0032089909072965384, + "learning_rate": 0.27959444535029976, + "loss": 0.1428, + "num_input_tokens_seen": 6082256, + "step": 6720 + }, + { + "epoch": 1.774976903787779, + "grad_norm": 0.0015486984048038721, + "learning_rate": 0.27956477350012243, + "loss": 0.1057, + "num_input_tokens_seen": 6086672, + "step": 6725 + }, + { + "epoch": 1.7762966873432757, + "grad_norm": 0.0032128416933119297, + "learning_rate": 0.27953508166942875, + "loss": 0.0979, + "num_input_tokens_seen": 6090832, + "step": 6730 + }, + { + "epoch": 1.7776164708987725, + "grad_norm": 0.0015094521222636104, + "learning_rate": 0.27950536986279767, + "loss": 0.1246, + "num_input_tokens_seen": 6095600, + "step": 6735 + }, + { + "epoch": 1.7789362544542695, + "grad_norm": 0.0034355921670794487, + "learning_rate": 0.2794756380848111, + "loss": 0.1202, + "num_input_tokens_seen": 6100304, + "step": 6740 + }, + { + "epoch": 1.7802560380097665, + "grad_norm": 0.004039716441184282, + "learning_rate": 0.279445886340054, + "loss": 0.159, + "num_input_tokens_seen": 6104816, + "step": 6745 + }, + { + "epoch": 1.7815758215652633, + "grad_norm": 0.001891267136670649, + "learning_rate": 0.27941611463311455, + "loss": 0.109, + "num_input_tokens_seen": 6109520, + "step": 6750 + }, + { + "epoch": 1.78289560512076, + "grad_norm": 0.005636060610413551, + "learning_rate": 0.2793863229685839, + "loss": 0.135, + "num_input_tokens_seen": 6114000, + "step": 6755 + }, + { + "epoch": 1.784215388676257, + "grad_norm": 0.005857315380126238, + "learning_rate": 0.27935651135105627, + "loss": 0.1098, + "num_input_tokens_seen": 6118864, + "step": 6760 + }, + { + "epoch": 1.785535172231754, + "grad_norm": 0.004989219829440117, + "learning_rate": 0.279326679785129, + "loss": 0.1192, + "num_input_tokens_seen": 6123376, + "step": 6765 + }, + { + "epoch": 1.7868549557872508, + "grad_norm": 0.005461641121655703, + "learning_rate": 0.2792968282754024, + "loss": 0.1064, + "num_input_tokens_seen": 6127888, + "step": 6770 + }, + { + "epoch": 1.7881747393427478, + "grad_norm": 0.008630335330963135, + "learning_rate": 0.2792669568264801, + "loss": 0.1476, + "num_input_tokens_seen": 6132464, + "step": 6775 + }, + { + "epoch": 1.7894945228982446, + "grad_norm": 0.0054665240459144115, + "learning_rate": 0.27923706544296856, + "loss": 0.1165, + "num_input_tokens_seen": 6136688, + "step": 6780 + }, + { + "epoch": 1.7908143064537416, + "grad_norm": 0.003313109278678894, + "learning_rate": 0.2792071541294775, + "loss": 0.1011, + "num_input_tokens_seen": 6141328, + "step": 6785 + }, + { + "epoch": 1.7921340900092386, + "grad_norm": 0.0052740671671926975, + "learning_rate": 0.27917722289061947, + "loss": 0.1278, + "num_input_tokens_seen": 6145840, + "step": 6790 + }, + { + "epoch": 1.7934538735647354, + "grad_norm": 0.002337298821657896, + "learning_rate": 0.27914727173101034, + "loss": 0.1288, + "num_input_tokens_seen": 6150000, + "step": 6795 + }, + { + "epoch": 1.7947736571202322, + "grad_norm": 0.003659531008452177, + "learning_rate": 0.279117300655269, + "loss": 0.1209, + "num_input_tokens_seen": 6154320, + "step": 6800 + }, + { + "epoch": 1.7947736571202322, + "eval_loss": 0.11580384522676468, + "eval_runtime": 75.845, + "eval_samples_per_second": 88.799, + "eval_steps_per_second": 22.203, + "num_input_tokens_seen": 6154320, + "step": 6800 + }, + { + "epoch": 1.7960934406757292, + "grad_norm": 0.003885297803208232, + "learning_rate": 0.2790873096680173, + "loss": 0.1058, + "num_input_tokens_seen": 6158960, + "step": 6805 + }, + { + "epoch": 1.7974132242312262, + "grad_norm": 0.0049006156623363495, + "learning_rate": 0.2790572987738802, + "loss": 0.0994, + "num_input_tokens_seen": 6163696, + "step": 6810 + }, + { + "epoch": 1.798733007786723, + "grad_norm": 0.003727799979969859, + "learning_rate": 0.27902726797748584, + "loss": 0.1288, + "num_input_tokens_seen": 6168112, + "step": 6815 + }, + { + "epoch": 1.8000527913422197, + "grad_norm": 0.004069710150361061, + "learning_rate": 0.2789972172834652, + "loss": 0.1201, + "num_input_tokens_seen": 6172848, + "step": 6820 + }, + { + "epoch": 1.8013725748977167, + "grad_norm": 0.004677552729845047, + "learning_rate": 0.2789671466964527, + "loss": 0.1181, + "num_input_tokens_seen": 6177488, + "step": 6825 + }, + { + "epoch": 1.8026923584532137, + "grad_norm": 0.006744274403899908, + "learning_rate": 0.2789370562210854, + "loss": 0.1064, + "num_input_tokens_seen": 6181840, + "step": 6830 + }, + { + "epoch": 1.8040121420087105, + "grad_norm": 0.004085478372871876, + "learning_rate": 0.27890694586200376, + "loss": 0.1296, + "num_input_tokens_seen": 6186256, + "step": 6835 + }, + { + "epoch": 1.8053319255642075, + "grad_norm": 0.0025377708952873945, + "learning_rate": 0.2788768156238511, + "loss": 0.1364, + "num_input_tokens_seen": 6190768, + "step": 6840 + }, + { + "epoch": 1.8066517091197043, + "grad_norm": 0.003895980305969715, + "learning_rate": 0.27884666551127385, + "loss": 0.1162, + "num_input_tokens_seen": 6195088, + "step": 6845 + }, + { + "epoch": 1.8079714926752013, + "grad_norm": 0.0038535678759217262, + "learning_rate": 0.2788164955289217, + "loss": 0.1265, + "num_input_tokens_seen": 6199536, + "step": 6850 + }, + { + "epoch": 1.8092912762306983, + "grad_norm": 0.004427267238497734, + "learning_rate": 0.27878630568144697, + "loss": 0.1356, + "num_input_tokens_seen": 6204304, + "step": 6855 + }, + { + "epoch": 1.810611059786195, + "grad_norm": 0.002369973110035062, + "learning_rate": 0.2787560959735056, + "loss": 0.113, + "num_input_tokens_seen": 6208784, + "step": 6860 + }, + { + "epoch": 1.8119308433416919, + "grad_norm": 0.008218875154852867, + "learning_rate": 0.27872586640975616, + "loss": 0.1396, + "num_input_tokens_seen": 6213392, + "step": 6865 + }, + { + "epoch": 1.8132506268971889, + "grad_norm": 0.005639341194182634, + "learning_rate": 0.27869561699486045, + "loss": 0.1044, + "num_input_tokens_seen": 6217872, + "step": 6870 + }, + { + "epoch": 1.8145704104526859, + "grad_norm": 0.003312513465061784, + "learning_rate": 0.2786653477334833, + "loss": 0.1116, + "num_input_tokens_seen": 6222320, + "step": 6875 + }, + { + "epoch": 1.8158901940081826, + "grad_norm": 0.0025355080142617226, + "learning_rate": 0.2786350586302926, + "loss": 0.1354, + "num_input_tokens_seen": 6226704, + "step": 6880 + }, + { + "epoch": 1.8172099775636794, + "grad_norm": 0.0038439175114035606, + "learning_rate": 0.27860474968995935, + "loss": 0.1299, + "num_input_tokens_seen": 6231408, + "step": 6885 + }, + { + "epoch": 1.8185297611191764, + "grad_norm": 0.003950489219278097, + "learning_rate": 0.27857442091715756, + "loss": 0.1291, + "num_input_tokens_seen": 6235856, + "step": 6890 + }, + { + "epoch": 1.8198495446746734, + "grad_norm": 0.003719951491802931, + "learning_rate": 0.27854407231656425, + "loss": 0.1421, + "num_input_tokens_seen": 6240400, + "step": 6895 + }, + { + "epoch": 1.8211693282301704, + "grad_norm": 0.0023302617482841015, + "learning_rate": 0.2785137038928596, + "loss": 0.1171, + "num_input_tokens_seen": 6244816, + "step": 6900 + }, + { + "epoch": 1.8224891117856672, + "grad_norm": 0.002470938954502344, + "learning_rate": 0.27848331565072687, + "loss": 0.0925, + "num_input_tokens_seen": 6249264, + "step": 6905 + }, + { + "epoch": 1.823808895341164, + "grad_norm": 0.004445945844054222, + "learning_rate": 0.27845290759485225, + "loss": 0.1015, + "num_input_tokens_seen": 6253712, + "step": 6910 + }, + { + "epoch": 1.825128678896661, + "grad_norm": 0.005275303032249212, + "learning_rate": 0.278422479729925, + "loss": 0.1267, + "num_input_tokens_seen": 6258128, + "step": 6915 + }, + { + "epoch": 1.826448462452158, + "grad_norm": 0.003286063903942704, + "learning_rate": 0.2783920320606375, + "loss": 0.1077, + "num_input_tokens_seen": 6262832, + "step": 6920 + }, + { + "epoch": 1.8277682460076548, + "grad_norm": 0.00423590000718832, + "learning_rate": 0.2783615645916852, + "loss": 0.1319, + "num_input_tokens_seen": 6266928, + "step": 6925 + }, + { + "epoch": 1.8290880295631515, + "grad_norm": 0.005324759986251593, + "learning_rate": 0.2783310773277666, + "loss": 0.1064, + "num_input_tokens_seen": 6271344, + "step": 6930 + }, + { + "epoch": 1.8304078131186485, + "grad_norm": 0.0036011466290801764, + "learning_rate": 0.2783005702735831, + "loss": 0.1765, + "num_input_tokens_seen": 6276144, + "step": 6935 + }, + { + "epoch": 1.8317275966741455, + "grad_norm": 0.0018706758273765445, + "learning_rate": 0.2782700434338394, + "loss": 0.0912, + "num_input_tokens_seen": 6280592, + "step": 6940 + }, + { + "epoch": 1.8330473802296423, + "grad_norm": 0.004009801894426346, + "learning_rate": 0.278239496813243, + "loss": 0.1103, + "num_input_tokens_seen": 6285296, + "step": 6945 + }, + { + "epoch": 1.834367163785139, + "grad_norm": 0.004504792392253876, + "learning_rate": 0.27820893041650463, + "loss": 0.1205, + "num_input_tokens_seen": 6289680, + "step": 6950 + }, + { + "epoch": 1.835686947340636, + "grad_norm": 0.002085951156914234, + "learning_rate": 0.27817834424833804, + "loss": 0.0859, + "num_input_tokens_seen": 6294192, + "step": 6955 + }, + { + "epoch": 1.837006730896133, + "grad_norm": 0.0034562130458652973, + "learning_rate": 0.27814773831345996, + "loss": 0.1348, + "num_input_tokens_seen": 6298704, + "step": 6960 + }, + { + "epoch": 1.83832651445163, + "grad_norm": 0.002501045586541295, + "learning_rate": 0.2781171126165902, + "loss": 0.1062, + "num_input_tokens_seen": 6303056, + "step": 6965 + }, + { + "epoch": 1.8396462980071269, + "grad_norm": 0.0025873431004583836, + "learning_rate": 0.2780864671624517, + "loss": 0.1362, + "num_input_tokens_seen": 6307312, + "step": 6970 + }, + { + "epoch": 1.8409660815626236, + "grad_norm": 0.0016587282298132777, + "learning_rate": 0.27805580195577034, + "loss": 0.0765, + "num_input_tokens_seen": 6311856, + "step": 6975 + }, + { + "epoch": 1.8422858651181206, + "grad_norm": 0.004103429149836302, + "learning_rate": 0.2780251170012751, + "loss": 0.1195, + "num_input_tokens_seen": 6316656, + "step": 6980 + }, + { + "epoch": 1.8436056486736176, + "grad_norm": 0.0034734918735921383, + "learning_rate": 0.27799441230369787, + "loss": 0.1135, + "num_input_tokens_seen": 6321072, + "step": 6985 + }, + { + "epoch": 1.8449254322291144, + "grad_norm": 0.004573277197778225, + "learning_rate": 0.27796368786777387, + "loss": 0.1123, + "num_input_tokens_seen": 6325456, + "step": 6990 + }, + { + "epoch": 1.8462452157846112, + "grad_norm": 0.0017830622382462025, + "learning_rate": 0.277932943698241, + "loss": 0.0896, + "num_input_tokens_seen": 6330000, + "step": 6995 + }, + { + "epoch": 1.8475649993401082, + "grad_norm": 0.0034493880812078714, + "learning_rate": 0.2779021797998406, + "loss": 0.1249, + "num_input_tokens_seen": 6334608, + "step": 7000 + }, + { + "epoch": 1.8475649993401082, + "eval_loss": 0.11815767735242844, + "eval_runtime": 75.8744, + "eval_samples_per_second": 88.765, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 6334608, + "step": 7000 + }, + { + "epoch": 1.8488847828956052, + "grad_norm": 0.002478705020621419, + "learning_rate": 0.2778713961773167, + "loss": 0.0997, + "num_input_tokens_seen": 6339152, + "step": 7005 + }, + { + "epoch": 1.850204566451102, + "grad_norm": 0.004532189574092627, + "learning_rate": 0.2778405928354166, + "loss": 0.1536, + "num_input_tokens_seen": 6343856, + "step": 7010 + }, + { + "epoch": 1.8515243500065988, + "grad_norm": 0.0018282467499375343, + "learning_rate": 0.27780976977889055, + "loss": 0.117, + "num_input_tokens_seen": 6348432, + "step": 7015 + }, + { + "epoch": 1.8528441335620958, + "grad_norm": 0.0047298395074903965, + "learning_rate": 0.27777892701249185, + "loss": 0.1382, + "num_input_tokens_seen": 6352880, + "step": 7020 + }, + { + "epoch": 1.8541639171175928, + "grad_norm": 0.003906717989593744, + "learning_rate": 0.2777480645409768, + "loss": 0.1202, + "num_input_tokens_seen": 6357264, + "step": 7025 + }, + { + "epoch": 1.8554837006730898, + "grad_norm": 0.004401597194373608, + "learning_rate": 0.27771718236910486, + "loss": 0.1254, + "num_input_tokens_seen": 6361520, + "step": 7030 + }, + { + "epoch": 1.8568034842285865, + "grad_norm": 0.005818085744976997, + "learning_rate": 0.27768628050163835, + "loss": 0.1174, + "num_input_tokens_seen": 6365648, + "step": 7035 + }, + { + "epoch": 1.8581232677840833, + "grad_norm": 0.004071171395480633, + "learning_rate": 0.2776553589433428, + "loss": 0.1341, + "num_input_tokens_seen": 6370256, + "step": 7040 + }, + { + "epoch": 1.8594430513395803, + "grad_norm": 0.0036609191447496414, + "learning_rate": 0.27762441769898666, + "loss": 0.144, + "num_input_tokens_seen": 6374800, + "step": 7045 + }, + { + "epoch": 1.8607628348950773, + "grad_norm": 0.005145995877683163, + "learning_rate": 0.2775934567733415, + "loss": 0.1372, + "num_input_tokens_seen": 6379504, + "step": 7050 + }, + { + "epoch": 1.862082618450574, + "grad_norm": 0.00358567014336586, + "learning_rate": 0.2775624761711819, + "loss": 0.1007, + "num_input_tokens_seen": 6384048, + "step": 7055 + }, + { + "epoch": 1.8634024020060709, + "grad_norm": 0.003980044275522232, + "learning_rate": 0.2775314758972854, + "loss": 0.0999, + "num_input_tokens_seen": 6388688, + "step": 7060 + }, + { + "epoch": 1.8647221855615679, + "grad_norm": 0.0043825507164001465, + "learning_rate": 0.2775004559564327, + "loss": 0.1063, + "num_input_tokens_seen": 6393392, + "step": 7065 + }, + { + "epoch": 1.8660419691170649, + "grad_norm": 0.004360267426818609, + "learning_rate": 0.2774694163534073, + "loss": 0.1358, + "num_input_tokens_seen": 6398064, + "step": 7070 + }, + { + "epoch": 1.8673617526725617, + "grad_norm": 0.005925125442445278, + "learning_rate": 0.27743835709299614, + "loss": 0.1369, + "num_input_tokens_seen": 6402736, + "step": 7075 + }, + { + "epoch": 1.8686815362280584, + "grad_norm": 0.0036532606463879347, + "learning_rate": 0.2774072781799888, + "loss": 0.1083, + "num_input_tokens_seen": 6407504, + "step": 7080 + }, + { + "epoch": 1.8700013197835554, + "grad_norm": 0.003034481080248952, + "learning_rate": 0.27737617961917804, + "loss": 0.1714, + "num_input_tokens_seen": 6411824, + "step": 7085 + }, + { + "epoch": 1.8713211033390524, + "grad_norm": 0.004999583121389151, + "learning_rate": 0.27734506141535964, + "loss": 0.1008, + "num_input_tokens_seen": 6416112, + "step": 7090 + }, + { + "epoch": 1.8726408868945494, + "grad_norm": 0.003280516481027007, + "learning_rate": 0.2773139235733325, + "loss": 0.1169, + "num_input_tokens_seen": 6420656, + "step": 7095 + }, + { + "epoch": 1.8739606704500462, + "grad_norm": 0.0018769819289445877, + "learning_rate": 0.2772827660978984, + "loss": 0.1058, + "num_input_tokens_seen": 6425392, + "step": 7100 + }, + { + "epoch": 1.875280454005543, + "grad_norm": 0.004437025170773268, + "learning_rate": 0.27725158899386226, + "loss": 0.0949, + "num_input_tokens_seen": 6429520, + "step": 7105 + }, + { + "epoch": 1.87660023756104, + "grad_norm": 0.004440493881702423, + "learning_rate": 0.27722039226603196, + "loss": 0.1434, + "num_input_tokens_seen": 6434416, + "step": 7110 + }, + { + "epoch": 1.877920021116537, + "grad_norm": 0.004361602012068033, + "learning_rate": 0.2771891759192184, + "loss": 0.1462, + "num_input_tokens_seen": 6439248, + "step": 7115 + }, + { + "epoch": 1.8792398046720338, + "grad_norm": 0.004659480880945921, + "learning_rate": 0.2771579399582355, + "loss": 0.1334, + "num_input_tokens_seen": 6443728, + "step": 7120 + }, + { + "epoch": 1.8805595882275306, + "grad_norm": 0.00161891826428473, + "learning_rate": 0.2771266843879004, + "loss": 0.0841, + "num_input_tokens_seen": 6448240, + "step": 7125 + }, + { + "epoch": 1.8818793717830276, + "grad_norm": 0.001866077771410346, + "learning_rate": 0.2770954092130329, + "loss": 0.1059, + "num_input_tokens_seen": 6452528, + "step": 7130 + }, + { + "epoch": 1.8831991553385246, + "grad_norm": 0.0025170736480504274, + "learning_rate": 0.27706411443845613, + "loss": 0.1145, + "num_input_tokens_seen": 6456912, + "step": 7135 + }, + { + "epoch": 1.8845189388940213, + "grad_norm": 0.002423736033961177, + "learning_rate": 0.27703280006899617, + "loss": 0.119, + "num_input_tokens_seen": 6461616, + "step": 7140 + }, + { + "epoch": 1.8858387224495183, + "grad_norm": 0.0019903036300092936, + "learning_rate": 0.277001466109482, + "loss": 0.0906, + "num_input_tokens_seen": 6465968, + "step": 7145 + }, + { + "epoch": 1.8871585060050151, + "grad_norm": 0.0019872949924319983, + "learning_rate": 0.2769701125647458, + "loss": 0.1372, + "num_input_tokens_seen": 6470512, + "step": 7150 + }, + { + "epoch": 1.8884782895605121, + "grad_norm": 0.002943481784313917, + "learning_rate": 0.27693873943962266, + "loss": 0.1325, + "num_input_tokens_seen": 6474896, + "step": 7155 + }, + { + "epoch": 1.8897980731160091, + "grad_norm": 0.0017637208802625537, + "learning_rate": 0.2769073467389506, + "loss": 0.1019, + "num_input_tokens_seen": 6479504, + "step": 7160 + }, + { + "epoch": 1.891117856671506, + "grad_norm": 0.002363082952797413, + "learning_rate": 0.2768759344675709, + "loss": 0.0798, + "num_input_tokens_seen": 6484112, + "step": 7165 + }, + { + "epoch": 1.8924376402270027, + "grad_norm": 0.005562731064856052, + "learning_rate": 0.27684450263032767, + "loss": 0.1628, + "num_input_tokens_seen": 6488560, + "step": 7170 + }, + { + "epoch": 1.8937574237824997, + "grad_norm": 0.0018878268310800195, + "learning_rate": 0.2768130512320682, + "loss": 0.1235, + "num_input_tokens_seen": 6492784, + "step": 7175 + }, + { + "epoch": 1.8950772073379967, + "grad_norm": 0.00297328969463706, + "learning_rate": 0.27678158027764244, + "loss": 0.1192, + "num_input_tokens_seen": 6497040, + "step": 7180 + }, + { + "epoch": 1.8963969908934935, + "grad_norm": 0.005704327020794153, + "learning_rate": 0.27675008977190385, + "loss": 0.1375, + "num_input_tokens_seen": 6501744, + "step": 7185 + }, + { + "epoch": 1.8977167744489902, + "grad_norm": 0.0031946010421961546, + "learning_rate": 0.2767185797197086, + "loss": 0.098, + "num_input_tokens_seen": 6506224, + "step": 7190 + }, + { + "epoch": 1.8990365580044872, + "grad_norm": 0.002850177465006709, + "learning_rate": 0.2766870501259159, + "loss": 0.1127, + "num_input_tokens_seen": 6510736, + "step": 7195 + }, + { + "epoch": 1.9003563415599842, + "grad_norm": 0.0013057257747277617, + "learning_rate": 0.276655500995388, + "loss": 0.0722, + "num_input_tokens_seen": 6515152, + "step": 7200 + }, + { + "epoch": 1.9003563415599842, + "eval_loss": 0.11629738658666611, + "eval_runtime": 75.809, + "eval_samples_per_second": 88.842, + "eval_steps_per_second": 22.214, + "num_input_tokens_seen": 6515152, + "step": 7200 + }, + { + "epoch": 1.901676125115481, + "grad_norm": 0.006641519721597433, + "learning_rate": 0.27662393233299015, + "loss": 0.1369, + "num_input_tokens_seen": 6519920, + "step": 7205 + }, + { + "epoch": 1.902995908670978, + "grad_norm": 0.0027041027788072824, + "learning_rate": 0.27659234414359074, + "loss": 0.0806, + "num_input_tokens_seen": 6524752, + "step": 7210 + }, + { + "epoch": 1.9043156922264748, + "grad_norm": 0.004387938417494297, + "learning_rate": 0.27656073643206097, + "loss": 0.1088, + "num_input_tokens_seen": 6529232, + "step": 7215 + }, + { + "epoch": 1.9056354757819718, + "grad_norm": 0.006496252957731485, + "learning_rate": 0.27652910920327517, + "loss": 0.1463, + "num_input_tokens_seen": 6533872, + "step": 7220 + }, + { + "epoch": 1.9069552593374688, + "grad_norm": 0.0018127411603927612, + "learning_rate": 0.2764974624621107, + "loss": 0.0991, + "num_input_tokens_seen": 6538416, + "step": 7225 + }, + { + "epoch": 1.9082750428929656, + "grad_norm": 0.003497915342450142, + "learning_rate": 0.2764657962134479, + "loss": 0.1175, + "num_input_tokens_seen": 6543056, + "step": 7230 + }, + { + "epoch": 1.9095948264484623, + "grad_norm": 0.005615835078060627, + "learning_rate": 0.27643411046217, + "loss": 0.0575, + "num_input_tokens_seen": 6547504, + "step": 7235 + }, + { + "epoch": 1.9109146100039593, + "grad_norm": 0.005171490367501974, + "learning_rate": 0.27640240521316334, + "loss": 0.1399, + "num_input_tokens_seen": 6551920, + "step": 7240 + }, + { + "epoch": 1.9122343935594563, + "grad_norm": 0.0028191928286105394, + "learning_rate": 0.2763706804713174, + "loss": 0.1596, + "num_input_tokens_seen": 6556432, + "step": 7245 + }, + { + "epoch": 1.9135541771149531, + "grad_norm": 0.0025738428812474012, + "learning_rate": 0.2763389362415245, + "loss": 0.15, + "num_input_tokens_seen": 6560784, + "step": 7250 + }, + { + "epoch": 1.91487396067045, + "grad_norm": 0.002627660520374775, + "learning_rate": 0.27630717252867987, + "loss": 0.1297, + "num_input_tokens_seen": 6565456, + "step": 7255 + }, + { + "epoch": 1.916193744225947, + "grad_norm": 0.0032320236787199974, + "learning_rate": 0.276275389337682, + "loss": 0.1429, + "num_input_tokens_seen": 6569872, + "step": 7260 + }, + { + "epoch": 1.917513527781444, + "grad_norm": 0.0026170173659920692, + "learning_rate": 0.2762435866734322, + "loss": 0.0952, + "num_input_tokens_seen": 6574416, + "step": 7265 + }, + { + "epoch": 1.918833311336941, + "grad_norm": 0.003220031503587961, + "learning_rate": 0.27621176454083485, + "loss": 0.1005, + "num_input_tokens_seen": 6578704, + "step": 7270 + }, + { + "epoch": 1.9201530948924377, + "grad_norm": 0.006116300355643034, + "learning_rate": 0.2761799229447973, + "loss": 0.1439, + "num_input_tokens_seen": 6583056, + "step": 7275 + }, + { + "epoch": 1.9214728784479345, + "grad_norm": 0.003928970079869032, + "learning_rate": 0.27614806189023006, + "loss": 0.104, + "num_input_tokens_seen": 6587440, + "step": 7280 + }, + { + "epoch": 1.9227926620034315, + "grad_norm": 0.004930334165692329, + "learning_rate": 0.27611618138204636, + "loss": 0.1724, + "num_input_tokens_seen": 6592016, + "step": 7285 + }, + { + "epoch": 1.9241124455589285, + "grad_norm": 0.0034714548382908106, + "learning_rate": 0.2760842814251626, + "loss": 0.1379, + "num_input_tokens_seen": 6596400, + "step": 7290 + }, + { + "epoch": 1.9254322291144252, + "grad_norm": 0.004433969501405954, + "learning_rate": 0.2760523620244982, + "loss": 0.1572, + "num_input_tokens_seen": 6600944, + "step": 7295 + }, + { + "epoch": 1.926752012669922, + "grad_norm": 0.0015110441017895937, + "learning_rate": 0.27602042318497544, + "loss": 0.0811, + "num_input_tokens_seen": 6605648, + "step": 7300 + }, + { + "epoch": 1.928071796225419, + "grad_norm": 0.004257429391145706, + "learning_rate": 0.2759884649115198, + "loss": 0.1185, + "num_input_tokens_seen": 6610096, + "step": 7305 + }, + { + "epoch": 1.929391579780916, + "grad_norm": 0.0021671804133802652, + "learning_rate": 0.2759564872090596, + "loss": 0.0938, + "num_input_tokens_seen": 6614704, + "step": 7310 + }, + { + "epoch": 1.9307113633364128, + "grad_norm": 0.0031617851927876472, + "learning_rate": 0.2759244900825262, + "loss": 0.099, + "num_input_tokens_seen": 6619056, + "step": 7315 + }, + { + "epoch": 1.9320311468919096, + "grad_norm": 0.005703274626284838, + "learning_rate": 0.2758924735368539, + "loss": 0.0912, + "num_input_tokens_seen": 6623632, + "step": 7320 + }, + { + "epoch": 1.9333509304474066, + "grad_norm": 0.004648784641176462, + "learning_rate": 0.27586043757698014, + "loss": 0.1498, + "num_input_tokens_seen": 6628208, + "step": 7325 + }, + { + "epoch": 1.9346707140029036, + "grad_norm": 0.003925797995179892, + "learning_rate": 0.27582838220784534, + "loss": 0.088, + "num_input_tokens_seen": 6632432, + "step": 7330 + }, + { + "epoch": 1.9359904975584006, + "grad_norm": 0.003936257213354111, + "learning_rate": 0.27579630743439265, + "loss": 0.1344, + "num_input_tokens_seen": 6637072, + "step": 7335 + }, + { + "epoch": 1.9373102811138974, + "grad_norm": 0.003006245940923691, + "learning_rate": 0.2757642132615686, + "loss": 0.1132, + "num_input_tokens_seen": 6641520, + "step": 7340 + }, + { + "epoch": 1.9386300646693941, + "grad_norm": 0.00568905845284462, + "learning_rate": 0.2757320996943223, + "loss": 0.1089, + "num_input_tokens_seen": 6645840, + "step": 7345 + }, + { + "epoch": 1.9399498482248911, + "grad_norm": 0.0028919395990669727, + "learning_rate": 0.2756999667376062, + "loss": 0.1031, + "num_input_tokens_seen": 6650320, + "step": 7350 + }, + { + "epoch": 1.9412696317803881, + "grad_norm": 0.0029269487131386995, + "learning_rate": 0.2756678143963756, + "loss": 0.1076, + "num_input_tokens_seen": 6654960, + "step": 7355 + }, + { + "epoch": 1.942589415335885, + "grad_norm": 0.0036051813513040543, + "learning_rate": 0.2756356426755888, + "loss": 0.1213, + "num_input_tokens_seen": 6659696, + "step": 7360 + }, + { + "epoch": 1.9439091988913817, + "grad_norm": 0.003478472586721182, + "learning_rate": 0.27560345158020705, + "loss": 0.1063, + "num_input_tokens_seen": 6664048, + "step": 7365 + }, + { + "epoch": 1.9452289824468787, + "grad_norm": 0.004023674409836531, + "learning_rate": 0.27557124111519465, + "loss": 0.1419, + "num_input_tokens_seen": 6668752, + "step": 7370 + }, + { + "epoch": 1.9465487660023757, + "grad_norm": 0.0034566924441605806, + "learning_rate": 0.27553901128551883, + "loss": 0.11, + "num_input_tokens_seen": 6673040, + "step": 7375 + }, + { + "epoch": 1.9478685495578725, + "grad_norm": 0.0021149530075490475, + "learning_rate": 0.2755067620961498, + "loss": 0.1261, + "num_input_tokens_seen": 6677520, + "step": 7380 + }, + { + "epoch": 1.9491883331133693, + "grad_norm": 0.004068438429385424, + "learning_rate": 0.27547449355206094, + "loss": 0.16, + "num_input_tokens_seen": 6682192, + "step": 7385 + }, + { + "epoch": 1.9505081166688663, + "grad_norm": 0.002915971912443638, + "learning_rate": 0.2754422056582283, + "loss": 0.1386, + "num_input_tokens_seen": 6686832, + "step": 7390 + }, + { + "epoch": 1.9518279002243633, + "grad_norm": 0.002475554123520851, + "learning_rate": 0.27540989841963115, + "loss": 0.1227, + "num_input_tokens_seen": 6691184, + "step": 7395 + }, + { + "epoch": 1.9531476837798603, + "grad_norm": 0.0033527754712849855, + "learning_rate": 0.27537757184125167, + "loss": 0.1209, + "num_input_tokens_seen": 6695472, + "step": 7400 + }, + { + "epoch": 1.9531476837798603, + "eval_loss": 0.11582686752080917, + "eval_runtime": 75.8788, + "eval_samples_per_second": 88.76, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 6695472, + "step": 7400 + }, + { + "epoch": 1.954467467335357, + "grad_norm": 0.004243124276399612, + "learning_rate": 0.275345225928075, + "loss": 0.1581, + "num_input_tokens_seen": 6699888, + "step": 7405 + }, + { + "epoch": 1.9557872508908538, + "grad_norm": 0.0016535002505406737, + "learning_rate": 0.2753128606850893, + "loss": 0.1112, + "num_input_tokens_seen": 6704400, + "step": 7410 + }, + { + "epoch": 1.9571070344463508, + "grad_norm": 0.004251670558005571, + "learning_rate": 0.2752804761172858, + "loss": 0.136, + "num_input_tokens_seen": 6709104, + "step": 7415 + }, + { + "epoch": 1.9584268180018478, + "grad_norm": 0.0023548875469714403, + "learning_rate": 0.27524807222965836, + "loss": 0.1223, + "num_input_tokens_seen": 6713776, + "step": 7420 + }, + { + "epoch": 1.9597466015573446, + "grad_norm": 0.002915344201028347, + "learning_rate": 0.27521564902720436, + "loss": 0.1067, + "num_input_tokens_seen": 6718192, + "step": 7425 + }, + { + "epoch": 1.9610663851128414, + "grad_norm": 0.0035515509080141783, + "learning_rate": 0.2751832065149236, + "loss": 0.1495, + "num_input_tokens_seen": 6722832, + "step": 7430 + }, + { + "epoch": 1.9623861686683384, + "grad_norm": 0.0034769554622471333, + "learning_rate": 0.2751507446978193, + "loss": 0.1001, + "num_input_tokens_seen": 6727472, + "step": 7435 + }, + { + "epoch": 1.9637059522238354, + "grad_norm": 0.002926889341324568, + "learning_rate": 0.2751182635808974, + "loss": 0.1172, + "num_input_tokens_seen": 6731952, + "step": 7440 + }, + { + "epoch": 1.9650257357793321, + "grad_norm": 0.0030843771528452635, + "learning_rate": 0.27508576316916694, + "loss": 0.1029, + "num_input_tokens_seen": 6736144, + "step": 7445 + }, + { + "epoch": 1.966345519334829, + "grad_norm": 0.0021859409753233194, + "learning_rate": 0.2750532434676399, + "loss": 0.0954, + "num_input_tokens_seen": 6740784, + "step": 7450 + }, + { + "epoch": 1.967665302890326, + "grad_norm": 0.005423896946012974, + "learning_rate": 0.27502070448133115, + "loss": 0.1305, + "num_input_tokens_seen": 6745328, + "step": 7455 + }, + { + "epoch": 1.968985086445823, + "grad_norm": 0.0032049643341451883, + "learning_rate": 0.2749881462152587, + "loss": 0.0984, + "num_input_tokens_seen": 6749872, + "step": 7460 + }, + { + "epoch": 1.97030487000132, + "grad_norm": 0.0035658914130181074, + "learning_rate": 0.2749555686744434, + "loss": 0.1371, + "num_input_tokens_seen": 6754192, + "step": 7465 + }, + { + "epoch": 1.9716246535568167, + "grad_norm": 0.004262909293174744, + "learning_rate": 0.2749229718639091, + "loss": 0.1149, + "num_input_tokens_seen": 6758992, + "step": 7470 + }, + { + "epoch": 1.9729444371123135, + "grad_norm": 0.0044702570885419846, + "learning_rate": 0.27489035578868265, + "loss": 0.0727, + "num_input_tokens_seen": 6763568, + "step": 7475 + }, + { + "epoch": 1.9742642206678105, + "grad_norm": 0.002375885145738721, + "learning_rate": 0.2748577204537939, + "loss": 0.0608, + "num_input_tokens_seen": 6768048, + "step": 7480 + }, + { + "epoch": 1.9755840042233075, + "grad_norm": 0.0014687731163576245, + "learning_rate": 0.2748250658642756, + "loss": 0.1263, + "num_input_tokens_seen": 6772304, + "step": 7485 + }, + { + "epoch": 1.9769037877788043, + "grad_norm": 0.0017525979783385992, + "learning_rate": 0.2747923920251634, + "loss": 0.1171, + "num_input_tokens_seen": 6776688, + "step": 7490 + }, + { + "epoch": 1.978223571334301, + "grad_norm": 0.004794395063072443, + "learning_rate": 0.27475969894149627, + "loss": 0.1307, + "num_input_tokens_seen": 6780880, + "step": 7495 + }, + { + "epoch": 1.979543354889798, + "grad_norm": 0.00435681501403451, + "learning_rate": 0.2747269866183156, + "loss": 0.0949, + "num_input_tokens_seen": 6785424, + "step": 7500 + }, + { + "epoch": 1.980863138445295, + "grad_norm": 0.0037961930502206087, + "learning_rate": 0.27469425506066625, + "loss": 0.114, + "num_input_tokens_seen": 6789840, + "step": 7505 + }, + { + "epoch": 1.9821829220007918, + "grad_norm": 0.004428061190992594, + "learning_rate": 0.27466150427359576, + "loss": 0.1031, + "num_input_tokens_seen": 6794544, + "step": 7510 + }, + { + "epoch": 1.9835027055562886, + "grad_norm": 0.005707364063709974, + "learning_rate": 0.2746287342621547, + "loss": 0.1111, + "num_input_tokens_seen": 6799216, + "step": 7515 + }, + { + "epoch": 1.9848224891117856, + "grad_norm": 0.002806875854730606, + "learning_rate": 0.2745959450313966, + "loss": 0.0843, + "num_input_tokens_seen": 6803632, + "step": 7520 + }, + { + "epoch": 1.9861422726672826, + "grad_norm": 0.0038948715664446354, + "learning_rate": 0.27456313658637804, + "loss": 0.1055, + "num_input_tokens_seen": 6808240, + "step": 7525 + }, + { + "epoch": 1.9874620562227796, + "grad_norm": 0.006429863628000021, + "learning_rate": 0.27453030893215846, + "loss": 0.141, + "num_input_tokens_seen": 6812912, + "step": 7530 + }, + { + "epoch": 1.9887818397782764, + "grad_norm": 0.002656359691172838, + "learning_rate": 0.2744974620738003, + "loss": 0.1448, + "num_input_tokens_seen": 6817456, + "step": 7535 + }, + { + "epoch": 1.9901016233337732, + "grad_norm": 0.0030079532880336046, + "learning_rate": 0.27446459601636897, + "loss": 0.1123, + "num_input_tokens_seen": 6821968, + "step": 7540 + }, + { + "epoch": 1.9914214068892702, + "grad_norm": 0.008682352490723133, + "learning_rate": 0.2744317107649328, + "loss": 0.1636, + "num_input_tokens_seen": 6826384, + "step": 7545 + }, + { + "epoch": 1.9927411904447672, + "grad_norm": 0.0024013868533074856, + "learning_rate": 0.2743988063245631, + "loss": 0.1157, + "num_input_tokens_seen": 6830896, + "step": 7550 + }, + { + "epoch": 1.994060974000264, + "grad_norm": 0.0021916846744716167, + "learning_rate": 0.2743658827003342, + "loss": 0.188, + "num_input_tokens_seen": 6835376, + "step": 7555 + }, + { + "epoch": 1.9953807575557607, + "grad_norm": 0.004051774740219116, + "learning_rate": 0.27433293989732327, + "loss": 0.1278, + "num_input_tokens_seen": 6839728, + "step": 7560 + }, + { + "epoch": 1.9967005411112577, + "grad_norm": 0.00299115851521492, + "learning_rate": 0.27429997792061056, + "loss": 0.1156, + "num_input_tokens_seen": 6844144, + "step": 7565 + }, + { + "epoch": 1.9980203246667547, + "grad_norm": 0.0019493825966492295, + "learning_rate": 0.27426699677527927, + "loss": 0.1356, + "num_input_tokens_seen": 6848400, + "step": 7570 + }, + { + "epoch": 1.9993401082222515, + "grad_norm": 0.0031005688942968845, + "learning_rate": 0.2742339964664154, + "loss": 0.1286, + "num_input_tokens_seen": 6853008, + "step": 7575 + }, + { + "epoch": 2.0005279134221987, + "grad_norm": 0.0020880666561424732, + "learning_rate": 0.274200976999108, + "loss": 0.1124, + "num_input_tokens_seen": 6857040, + "step": 7580 + }, + { + "epoch": 2.0018476969776957, + "grad_norm": 0.0029990533366799355, + "learning_rate": 0.27416793837844916, + "loss": 0.1012, + "num_input_tokens_seen": 6861424, + "step": 7585 + }, + { + "epoch": 2.0031674805331927, + "grad_norm": 0.0025153490714728832, + "learning_rate": 0.27413488060953384, + "loss": 0.0905, + "num_input_tokens_seen": 6865904, + "step": 7590 + }, + { + "epoch": 2.0044872640886893, + "grad_norm": 0.0015223427908495069, + "learning_rate": 0.27410180369745996, + "loss": 0.1166, + "num_input_tokens_seen": 6870672, + "step": 7595 + }, + { + "epoch": 2.0058070476441863, + "grad_norm": 0.0021136056166142225, + "learning_rate": 0.27406870764732844, + "loss": 0.1409, + "num_input_tokens_seen": 6875184, + "step": 7600 + }, + { + "epoch": 2.0058070476441863, + "eval_loss": 0.11441688984632492, + "eval_runtime": 75.8477, + "eval_samples_per_second": 88.796, + "eval_steps_per_second": 22.202, + "num_input_tokens_seen": 6875184, + "step": 7600 + }, + { + "epoch": 2.0071268311996833, + "grad_norm": 0.0026120473630726337, + "learning_rate": 0.27403559246424297, + "loss": 0.1119, + "num_input_tokens_seen": 6879696, + "step": 7605 + }, + { + "epoch": 2.0084466147551803, + "grad_norm": 0.0031459543388336897, + "learning_rate": 0.2740024581533105, + "loss": 0.0926, + "num_input_tokens_seen": 6884592, + "step": 7610 + }, + { + "epoch": 2.009766398310677, + "grad_norm": 0.003948332741856575, + "learning_rate": 0.2739693047196406, + "loss": 0.0875, + "num_input_tokens_seen": 6889136, + "step": 7615 + }, + { + "epoch": 2.011086181866174, + "grad_norm": 0.002122984966263175, + "learning_rate": 0.27393613216834606, + "loss": 0.1271, + "num_input_tokens_seen": 6893712, + "step": 7620 + }, + { + "epoch": 2.012405965421671, + "grad_norm": 0.0028304425068199635, + "learning_rate": 0.2739029405045424, + "loss": 0.0819, + "num_input_tokens_seen": 6898064, + "step": 7625 + }, + { + "epoch": 2.013725748977168, + "grad_norm": 0.00462275231257081, + "learning_rate": 0.2738697297333483, + "loss": 0.0713, + "num_input_tokens_seen": 6902512, + "step": 7630 + }, + { + "epoch": 2.015045532532665, + "grad_norm": 0.0030395984649658203, + "learning_rate": 0.2738364998598852, + "loss": 0.0782, + "num_input_tokens_seen": 6907248, + "step": 7635 + }, + { + "epoch": 2.0163653160881614, + "grad_norm": 0.005041665863245726, + "learning_rate": 0.27380325088927765, + "loss": 0.1069, + "num_input_tokens_seen": 6911824, + "step": 7640 + }, + { + "epoch": 2.0176850996436584, + "grad_norm": 0.006142966914921999, + "learning_rate": 0.27376998282665294, + "loss": 0.0793, + "num_input_tokens_seen": 6916464, + "step": 7645 + }, + { + "epoch": 2.0190048831991554, + "grad_norm": 0.00507360277697444, + "learning_rate": 0.27373669567714154, + "loss": 0.0891, + "num_input_tokens_seen": 6921360, + "step": 7650 + }, + { + "epoch": 2.0203246667546524, + "grad_norm": 0.0035906857810914516, + "learning_rate": 0.27370338944587663, + "loss": 0.1058, + "num_input_tokens_seen": 6925904, + "step": 7655 + }, + { + "epoch": 2.021644450310149, + "grad_norm": 0.0034333462826907635, + "learning_rate": 0.27367006413799455, + "loss": 0.1173, + "num_input_tokens_seen": 6930512, + "step": 7660 + }, + { + "epoch": 2.022964233865646, + "grad_norm": 0.0018301672535017133, + "learning_rate": 0.2736367197586345, + "loss": 0.1015, + "num_input_tokens_seen": 6935056, + "step": 7665 + }, + { + "epoch": 2.024284017421143, + "grad_norm": 0.0032540038228034973, + "learning_rate": 0.2736033563129385, + "loss": 0.108, + "num_input_tokens_seen": 6939216, + "step": 7670 + }, + { + "epoch": 2.02560380097664, + "grad_norm": 0.004731602966785431, + "learning_rate": 0.27356997380605164, + "loss": 0.1336, + "num_input_tokens_seen": 6943760, + "step": 7675 + }, + { + "epoch": 2.026923584532137, + "grad_norm": 0.002597480546683073, + "learning_rate": 0.27353657224312194, + "loss": 0.0979, + "num_input_tokens_seen": 6948368, + "step": 7680 + }, + { + "epoch": 2.0282433680876335, + "grad_norm": 0.0037440622691065073, + "learning_rate": 0.2735031516293004, + "loss": 0.1291, + "num_input_tokens_seen": 6952752, + "step": 7685 + }, + { + "epoch": 2.0295631516431305, + "grad_norm": 0.0027474602684378624, + "learning_rate": 0.2734697119697408, + "loss": 0.0746, + "num_input_tokens_seen": 6957520, + "step": 7690 + }, + { + "epoch": 2.0308829351986275, + "grad_norm": 0.004530278965830803, + "learning_rate": 0.27343625326959997, + "loss": 0.1068, + "num_input_tokens_seen": 6961936, + "step": 7695 + }, + { + "epoch": 2.0322027187541245, + "grad_norm": 0.002356254030019045, + "learning_rate": 0.27340277553403775, + "loss": 0.0763, + "num_input_tokens_seen": 6966640, + "step": 7700 + }, + { + "epoch": 2.033522502309621, + "grad_norm": 0.0022097842302173376, + "learning_rate": 0.2733692787682167, + "loss": 0.1053, + "num_input_tokens_seen": 6971024, + "step": 7705 + }, + { + "epoch": 2.034842285865118, + "grad_norm": 0.0026242888998240232, + "learning_rate": 0.27333576297730255, + "loss": 0.073, + "num_input_tokens_seen": 6975536, + "step": 7710 + }, + { + "epoch": 2.036162069420615, + "grad_norm": 0.0072287158109247684, + "learning_rate": 0.2733022281664638, + "loss": 0.0935, + "num_input_tokens_seen": 6980336, + "step": 7715 + }, + { + "epoch": 2.037481852976112, + "grad_norm": 0.002424800070002675, + "learning_rate": 0.273268674340872, + "loss": 0.091, + "num_input_tokens_seen": 6984784, + "step": 7720 + }, + { + "epoch": 2.0388016365316086, + "grad_norm": 0.001986142946407199, + "learning_rate": 0.27323510150570146, + "loss": 0.0639, + "num_input_tokens_seen": 6989360, + "step": 7725 + }, + { + "epoch": 2.0401214200871056, + "grad_norm": 0.00485910614952445, + "learning_rate": 0.27320150966612966, + "loss": 0.1105, + "num_input_tokens_seen": 6993904, + "step": 7730 + }, + { + "epoch": 2.0414412036426026, + "grad_norm": 0.004698901902884245, + "learning_rate": 0.2731678988273368, + "loss": 0.0885, + "num_input_tokens_seen": 6998544, + "step": 7735 + }, + { + "epoch": 2.0427609871980996, + "grad_norm": 0.0014744247309863567, + "learning_rate": 0.27313426899450605, + "loss": 0.1052, + "num_input_tokens_seen": 7003152, + "step": 7740 + }, + { + "epoch": 2.0440807707535966, + "grad_norm": 0.005518476944416761, + "learning_rate": 0.27310062017282366, + "loss": 0.1223, + "num_input_tokens_seen": 7007856, + "step": 7745 + }, + { + "epoch": 2.045400554309093, + "grad_norm": 0.005026519298553467, + "learning_rate": 0.2730669523674787, + "loss": 0.1252, + "num_input_tokens_seen": 7012784, + "step": 7750 + }, + { + "epoch": 2.04672033786459, + "grad_norm": 0.00401030108332634, + "learning_rate": 0.2730332655836631, + "loss": 0.0886, + "num_input_tokens_seen": 7017392, + "step": 7755 + }, + { + "epoch": 2.048040121420087, + "grad_norm": 0.0030263725202530622, + "learning_rate": 0.2729995598265718, + "loss": 0.1191, + "num_input_tokens_seen": 7021776, + "step": 7760 + }, + { + "epoch": 2.049359904975584, + "grad_norm": 0.002824401017278433, + "learning_rate": 0.2729658351014027, + "loss": 0.1125, + "num_input_tokens_seen": 7026128, + "step": 7765 + }, + { + "epoch": 2.0506796885310807, + "grad_norm": 0.0024067764170467854, + "learning_rate": 0.27293209141335656, + "loss": 0.0861, + "num_input_tokens_seen": 7030512, + "step": 7770 + }, + { + "epoch": 2.0519994720865777, + "grad_norm": 0.00402925256639719, + "learning_rate": 0.27289832876763703, + "loss": 0.1347, + "num_input_tokens_seen": 7034960, + "step": 7775 + }, + { + "epoch": 2.0533192556420747, + "grad_norm": 0.0023569113109260798, + "learning_rate": 0.27286454716945074, + "loss": 0.1234, + "num_input_tokens_seen": 7039472, + "step": 7780 + }, + { + "epoch": 2.0546390391975717, + "grad_norm": 0.005406923126429319, + "learning_rate": 0.27283074662400725, + "loss": 0.1461, + "num_input_tokens_seen": 7043856, + "step": 7785 + }, + { + "epoch": 2.0559588227530683, + "grad_norm": 0.003681051777675748, + "learning_rate": 0.2727969271365191, + "loss": 0.0563, + "num_input_tokens_seen": 7048336, + "step": 7790 + }, + { + "epoch": 2.0572786063085653, + "grad_norm": 0.0025117481127381325, + "learning_rate": 0.2727630887122016, + "loss": 0.1302, + "num_input_tokens_seen": 7052944, + "step": 7795 + }, + { + "epoch": 2.0585983898640623, + "grad_norm": 0.004164000973105431, + "learning_rate": 0.27272923135627314, + "loss": 0.1443, + "num_input_tokens_seen": 7057584, + "step": 7800 + }, + { + "epoch": 2.0585983898640623, + "eval_loss": 0.11482516676187515, + "eval_runtime": 75.8941, + "eval_samples_per_second": 88.742, + "eval_steps_per_second": 22.189, + "num_input_tokens_seen": 7057584, + "step": 7800 + }, + { + "epoch": 2.0599181734195593, + "grad_norm": 0.0022735295351594687, + "learning_rate": 0.2726953550739548, + "loss": 0.1036, + "num_input_tokens_seen": 7061808, + "step": 7805 + }, + { + "epoch": 2.0612379569750563, + "grad_norm": 0.002828357508406043, + "learning_rate": 0.27266145987047086, + "loss": 0.0747, + "num_input_tokens_seen": 7066320, + "step": 7810 + }, + { + "epoch": 2.062557740530553, + "grad_norm": 0.004263861570507288, + "learning_rate": 0.27262754575104836, + "loss": 0.0774, + "num_input_tokens_seen": 7070544, + "step": 7815 + }, + { + "epoch": 2.06387752408605, + "grad_norm": 0.0059275394305586815, + "learning_rate": 0.27259361272091726, + "loss": 0.0883, + "num_input_tokens_seen": 7074864, + "step": 7820 + }, + { + "epoch": 2.065197307641547, + "grad_norm": 0.00394726125523448, + "learning_rate": 0.27255966078531046, + "loss": 0.1092, + "num_input_tokens_seen": 7078992, + "step": 7825 + }, + { + "epoch": 2.066517091197044, + "grad_norm": 0.0025036924052983522, + "learning_rate": 0.2725256899494638, + "loss": 0.0986, + "num_input_tokens_seen": 7083280, + "step": 7830 + }, + { + "epoch": 2.0678368747525404, + "grad_norm": 0.003940202761441469, + "learning_rate": 0.272491700218616, + "loss": 0.1244, + "num_input_tokens_seen": 7087728, + "step": 7835 + }, + { + "epoch": 2.0691566583080374, + "grad_norm": 0.006051783449947834, + "learning_rate": 0.27245769159800876, + "loss": 0.1129, + "num_input_tokens_seen": 7092240, + "step": 7840 + }, + { + "epoch": 2.0704764418635344, + "grad_norm": 0.002911379560828209, + "learning_rate": 0.2724236640928865, + "loss": 0.1229, + "num_input_tokens_seen": 7096752, + "step": 7845 + }, + { + "epoch": 2.0717962254190314, + "grad_norm": 0.003620479255914688, + "learning_rate": 0.27238961770849673, + "loss": 0.1515, + "num_input_tokens_seen": 7101456, + "step": 7850 + }, + { + "epoch": 2.073116008974528, + "grad_norm": 0.0036804114934056997, + "learning_rate": 0.27235555245008997, + "loss": 0.1381, + "num_input_tokens_seen": 7105744, + "step": 7855 + }, + { + "epoch": 2.074435792530025, + "grad_norm": 0.0023003595415502787, + "learning_rate": 0.2723214683229193, + "loss": 0.0642, + "num_input_tokens_seen": 7110256, + "step": 7860 + }, + { + "epoch": 2.075755576085522, + "grad_norm": 0.0028518904000520706, + "learning_rate": 0.27228736533224107, + "loss": 0.1196, + "num_input_tokens_seen": 7114992, + "step": 7865 + }, + { + "epoch": 2.077075359641019, + "grad_norm": 0.0013454011641442776, + "learning_rate": 0.27225324348331437, + "loss": 0.0914, + "num_input_tokens_seen": 7119376, + "step": 7870 + }, + { + "epoch": 2.078395143196516, + "grad_norm": 0.004492990206927061, + "learning_rate": 0.27221910278140116, + "loss": 0.1292, + "num_input_tokens_seen": 7124272, + "step": 7875 + }, + { + "epoch": 2.0797149267520125, + "grad_norm": 0.002983746351674199, + "learning_rate": 0.2721849432317664, + "loss": 0.0571, + "num_input_tokens_seen": 7128848, + "step": 7880 + }, + { + "epoch": 2.0810347103075095, + "grad_norm": 0.006059369072318077, + "learning_rate": 0.2721507648396779, + "loss": 0.1398, + "num_input_tokens_seen": 7132976, + "step": 7885 + }, + { + "epoch": 2.0823544938630065, + "grad_norm": 0.006655285134911537, + "learning_rate": 0.27211656761040653, + "loss": 0.1083, + "num_input_tokens_seen": 7137584, + "step": 7890 + }, + { + "epoch": 2.0836742774185035, + "grad_norm": 0.004507059697061777, + "learning_rate": 0.2720823515492257, + "loss": 0.13, + "num_input_tokens_seen": 7142288, + "step": 7895 + }, + { + "epoch": 2.084994060974, + "grad_norm": 0.004073692951351404, + "learning_rate": 0.27204811666141215, + "loss": 0.1233, + "num_input_tokens_seen": 7147024, + "step": 7900 + }, + { + "epoch": 2.086313844529497, + "grad_norm": 0.0031537427566945553, + "learning_rate": 0.2720138629522452, + "loss": 0.1318, + "num_input_tokens_seen": 7151408, + "step": 7905 + }, + { + "epoch": 2.087633628084994, + "grad_norm": 0.0037395725958049297, + "learning_rate": 0.2719795904270073, + "loss": 0.1177, + "num_input_tokens_seen": 7155952, + "step": 7910 + }, + { + "epoch": 2.088953411640491, + "grad_norm": 0.002238951390609145, + "learning_rate": 0.2719452990909837, + "loss": 0.1, + "num_input_tokens_seen": 7160176, + "step": 7915 + }, + { + "epoch": 2.0902731951959876, + "grad_norm": 0.004159680567681789, + "learning_rate": 0.2719109889494625, + "loss": 0.1401, + "num_input_tokens_seen": 7164688, + "step": 7920 + }, + { + "epoch": 2.0915929787514846, + "grad_norm": 0.0022056768648326397, + "learning_rate": 0.27187666000773475, + "loss": 0.102, + "num_input_tokens_seen": 7169392, + "step": 7925 + }, + { + "epoch": 2.0929127623069816, + "grad_norm": 0.003346838755533099, + "learning_rate": 0.2718423122710944, + "loss": 0.1045, + "num_input_tokens_seen": 7173808, + "step": 7930 + }, + { + "epoch": 2.0942325458624786, + "grad_norm": 0.003201281651854515, + "learning_rate": 0.2718079457448384, + "loss": 0.1002, + "num_input_tokens_seen": 7178288, + "step": 7935 + }, + { + "epoch": 2.0955523294179756, + "grad_norm": 0.0024025095626711845, + "learning_rate": 0.27177356043426637, + "loss": 0.0743, + "num_input_tokens_seen": 7182352, + "step": 7940 + }, + { + "epoch": 2.096872112973472, + "grad_norm": 0.0023679158184677362, + "learning_rate": 0.27173915634468104, + "loss": 0.1053, + "num_input_tokens_seen": 7186864, + "step": 7945 + }, + { + "epoch": 2.098191896528969, + "grad_norm": 0.002762086456641555, + "learning_rate": 0.27170473348138796, + "loss": 0.0983, + "num_input_tokens_seen": 7191728, + "step": 7950 + }, + { + "epoch": 2.099511680084466, + "grad_norm": 0.005928804632276297, + "learning_rate": 0.27167029184969554, + "loss": 0.1409, + "num_input_tokens_seen": 7196560, + "step": 7955 + }, + { + "epoch": 2.100831463639963, + "grad_norm": 0.004522210452705622, + "learning_rate": 0.27163583145491504, + "loss": 0.1167, + "num_input_tokens_seen": 7201232, + "step": 7960 + }, + { + "epoch": 2.1021512471954598, + "grad_norm": 0.004742633085697889, + "learning_rate": 0.2716013523023608, + "loss": 0.1309, + "num_input_tokens_seen": 7205840, + "step": 7965 + }, + { + "epoch": 2.1034710307509568, + "grad_norm": 0.0029085655696690083, + "learning_rate": 0.27156685439734995, + "loss": 0.115, + "num_input_tokens_seen": 7210384, + "step": 7970 + }, + { + "epoch": 2.1047908143064538, + "grad_norm": 0.00355590065009892, + "learning_rate": 0.2715323377452024, + "loss": 0.1288, + "num_input_tokens_seen": 7214608, + "step": 7975 + }, + { + "epoch": 2.1061105978619508, + "grad_norm": 0.002942997496575117, + "learning_rate": 0.2714978023512411, + "loss": 0.0887, + "num_input_tokens_seen": 7219024, + "step": 7980 + }, + { + "epoch": 2.1074303814174478, + "grad_norm": 0.003827089909464121, + "learning_rate": 0.2714632482207918, + "loss": 0.1277, + "num_input_tokens_seen": 7223504, + "step": 7985 + }, + { + "epoch": 2.1087501649729443, + "grad_norm": 0.006851798854768276, + "learning_rate": 0.2714286753591833, + "loss": 0.1265, + "num_input_tokens_seen": 7227984, + "step": 7990 + }, + { + "epoch": 2.1100699485284413, + "grad_norm": 0.004453783854842186, + "learning_rate": 0.27139408377174706, + "loss": 0.0808, + "num_input_tokens_seen": 7232240, + "step": 7995 + }, + { + "epoch": 2.1113897320839383, + "grad_norm": 0.004503489471971989, + "learning_rate": 0.27135947346381756, + "loss": 0.1163, + "num_input_tokens_seen": 7236880, + "step": 8000 + }, + { + "epoch": 2.1113897320839383, + "eval_loss": 0.11742153018712997, + "eval_runtime": 75.898, + "eval_samples_per_second": 88.738, + "eval_steps_per_second": 22.188, + "num_input_tokens_seen": 7236880, + "step": 8000 + }, + { + "epoch": 2.1127095156394353, + "grad_norm": 0.003800778416916728, + "learning_rate": 0.2713248444407322, + "loss": 0.1425, + "num_input_tokens_seen": 7241808, + "step": 8005 + }, + { + "epoch": 2.114029299194932, + "grad_norm": 0.0032028821296989918, + "learning_rate": 0.27129019670783106, + "loss": 0.0859, + "num_input_tokens_seen": 7246480, + "step": 8010 + }, + { + "epoch": 2.115349082750429, + "grad_norm": 0.0020890177693217993, + "learning_rate": 0.27125553027045746, + "loss": 0.1085, + "num_input_tokens_seen": 7250800, + "step": 8015 + }, + { + "epoch": 2.116668866305926, + "grad_norm": 0.0029531323816627264, + "learning_rate": 0.2712208451339572, + "loss": 0.104, + "num_input_tokens_seen": 7255280, + "step": 8020 + }, + { + "epoch": 2.117988649861423, + "grad_norm": 0.002582130953669548, + "learning_rate": 0.27118614130367935, + "loss": 0.1489, + "num_input_tokens_seen": 7259632, + "step": 8025 + }, + { + "epoch": 2.1193084334169194, + "grad_norm": 0.004656744189560413, + "learning_rate": 0.2711514187849756, + "loss": 0.1128, + "num_input_tokens_seen": 7264016, + "step": 8030 + }, + { + "epoch": 2.1206282169724164, + "grad_norm": 0.0034528709948062897, + "learning_rate": 0.27111667758320057, + "loss": 0.0986, + "num_input_tokens_seen": 7268688, + "step": 8035 + }, + { + "epoch": 2.1219480005279134, + "grad_norm": 0.00518913846462965, + "learning_rate": 0.27108191770371176, + "loss": 0.1259, + "num_input_tokens_seen": 7273360, + "step": 8040 + }, + { + "epoch": 2.1232677840834104, + "grad_norm": 0.003116208827123046, + "learning_rate": 0.2710471391518697, + "loss": 0.1213, + "num_input_tokens_seen": 7277872, + "step": 8045 + }, + { + "epoch": 2.1245875676389074, + "grad_norm": 0.002338405465707183, + "learning_rate": 0.2710123419330375, + "loss": 0.1106, + "num_input_tokens_seen": 7282352, + "step": 8050 + }, + { + "epoch": 2.125907351194404, + "grad_norm": 0.004334870260208845, + "learning_rate": 0.2709775260525816, + "loss": 0.1126, + "num_input_tokens_seen": 7286800, + "step": 8055 + }, + { + "epoch": 2.127227134749901, + "grad_norm": 0.00335907656699419, + "learning_rate": 0.27094269151587075, + "loss": 0.1295, + "num_input_tokens_seen": 7291344, + "step": 8060 + }, + { + "epoch": 2.128546918305398, + "grad_norm": 0.0032388437539339066, + "learning_rate": 0.27090783832827703, + "loss": 0.121, + "num_input_tokens_seen": 7295984, + "step": 8065 + }, + { + "epoch": 2.129866701860895, + "grad_norm": 0.0018747353460639715, + "learning_rate": 0.2708729664951753, + "loss": 0.0789, + "num_input_tokens_seen": 7300496, + "step": 8070 + }, + { + "epoch": 2.1311864854163916, + "grad_norm": 0.0021483940072357655, + "learning_rate": 0.27083807602194304, + "loss": 0.1275, + "num_input_tokens_seen": 7305328, + "step": 8075 + }, + { + "epoch": 2.1325062689718886, + "grad_norm": 0.0037328600883483887, + "learning_rate": 0.270803166913961, + "loss": 0.1298, + "num_input_tokens_seen": 7309840, + "step": 8080 + }, + { + "epoch": 2.1338260525273856, + "grad_norm": 0.00437212036922574, + "learning_rate": 0.27076823917661247, + "loss": 0.1083, + "num_input_tokens_seen": 7314416, + "step": 8085 + }, + { + "epoch": 2.1351458360828826, + "grad_norm": 0.004670802038162947, + "learning_rate": 0.2707332928152838, + "loss": 0.1363, + "num_input_tokens_seen": 7318928, + "step": 8090 + }, + { + "epoch": 2.136465619638379, + "grad_norm": 0.004293084144592285, + "learning_rate": 0.2706983278353641, + "loss": 0.1273, + "num_input_tokens_seen": 7323344, + "step": 8095 + }, + { + "epoch": 2.137785403193876, + "grad_norm": 0.0023639234714210033, + "learning_rate": 0.27066334424224553, + "loss": 0.1344, + "num_input_tokens_seen": 7327696, + "step": 8100 + }, + { + "epoch": 2.139105186749373, + "grad_norm": 0.002716309856623411, + "learning_rate": 0.27062834204132297, + "loss": 0.1434, + "num_input_tokens_seen": 7332528, + "step": 8105 + }, + { + "epoch": 2.14042497030487, + "grad_norm": 0.004068536218255758, + "learning_rate": 0.27059332123799407, + "loss": 0.1698, + "num_input_tokens_seen": 7337232, + "step": 8110 + }, + { + "epoch": 2.141744753860367, + "grad_norm": 0.005915376357734203, + "learning_rate": 0.27055828183765956, + "loss": 0.1314, + "num_input_tokens_seen": 7341456, + "step": 8115 + }, + { + "epoch": 2.1430645374158637, + "grad_norm": 0.003360015107318759, + "learning_rate": 0.270523223845723, + "loss": 0.0937, + "num_input_tokens_seen": 7345968, + "step": 8120 + }, + { + "epoch": 2.1443843209713607, + "grad_norm": 0.00212475867010653, + "learning_rate": 0.2704881472675907, + "loss": 0.0835, + "num_input_tokens_seen": 7350512, + "step": 8125 + }, + { + "epoch": 2.1457041045268577, + "grad_norm": 0.004038514569401741, + "learning_rate": 0.270453052108672, + "loss": 0.0975, + "num_input_tokens_seen": 7355056, + "step": 8130 + }, + { + "epoch": 2.1470238880823547, + "grad_norm": 0.0027848565950989723, + "learning_rate": 0.2704179383743789, + "loss": 0.117, + "num_input_tokens_seen": 7359984, + "step": 8135 + }, + { + "epoch": 2.1483436716378512, + "grad_norm": 0.006819409783929586, + "learning_rate": 0.27038280607012644, + "loss": 0.1023, + "num_input_tokens_seen": 7364432, + "step": 8140 + }, + { + "epoch": 2.1496634551933482, + "grad_norm": 0.0034184062387794256, + "learning_rate": 0.27034765520133247, + "loss": 0.0885, + "num_input_tokens_seen": 7369360, + "step": 8145 + }, + { + "epoch": 2.1509832387488452, + "grad_norm": 0.003417749423533678, + "learning_rate": 0.2703124857734177, + "loss": 0.1021, + "num_input_tokens_seen": 7373872, + "step": 8150 + }, + { + "epoch": 2.1523030223043422, + "grad_norm": 0.0032616702374070883, + "learning_rate": 0.27027729779180565, + "loss": 0.1103, + "num_input_tokens_seen": 7378128, + "step": 8155 + }, + { + "epoch": 2.153622805859839, + "grad_norm": 0.006032530218362808, + "learning_rate": 0.27024209126192283, + "loss": 0.1206, + "num_input_tokens_seen": 7382320, + "step": 8160 + }, + { + "epoch": 2.154942589415336, + "grad_norm": 0.006499054841697216, + "learning_rate": 0.2702068661891984, + "loss": 0.0955, + "num_input_tokens_seen": 7386928, + "step": 8165 + }, + { + "epoch": 2.156262372970833, + "grad_norm": 0.002395150251686573, + "learning_rate": 0.2701716225790647, + "loss": 0.0748, + "num_input_tokens_seen": 7391120, + "step": 8170 + }, + { + "epoch": 2.15758215652633, + "grad_norm": 0.0028330169152468443, + "learning_rate": 0.27013636043695655, + "loss": 0.1263, + "num_input_tokens_seen": 7395728, + "step": 8175 + }, + { + "epoch": 2.158901940081827, + "grad_norm": 0.0038624436128884554, + "learning_rate": 0.27010107976831194, + "loss": 0.0744, + "num_input_tokens_seen": 7400112, + "step": 8180 + }, + { + "epoch": 2.1602217236373233, + "grad_norm": 0.004116017837077379, + "learning_rate": 0.2700657805785715, + "loss": 0.1061, + "num_input_tokens_seen": 7404880, + "step": 8185 + }, + { + "epoch": 2.1615415071928203, + "grad_norm": 0.0033797260839492083, + "learning_rate": 0.2700304628731789, + "loss": 0.1021, + "num_input_tokens_seen": 7409360, + "step": 8190 + }, + { + "epoch": 2.1628612907483173, + "grad_norm": 0.0031785129103809595, + "learning_rate": 0.26999512665758046, + "loss": 0.0879, + "num_input_tokens_seen": 7414000, + "step": 8195 + }, + { + "epoch": 2.1641810743038143, + "grad_norm": 0.002698357915505767, + "learning_rate": 0.2699597719372256, + "loss": 0.0961, + "num_input_tokens_seen": 7418160, + "step": 8200 + }, + { + "epoch": 2.1641810743038143, + "eval_loss": 0.12959469854831696, + "eval_runtime": 75.9694, + "eval_samples_per_second": 88.654, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 7418160, + "step": 8200 + }, + { + "epoch": 2.165500857859311, + "grad_norm": 0.0034466064535081387, + "learning_rate": 0.26992439871756635, + "loss": 0.1141, + "num_input_tokens_seen": 7422480, + "step": 8205 + }, + { + "epoch": 2.166820641414808, + "grad_norm": 0.0025018611922860146, + "learning_rate": 0.2698890070040578, + "loss": 0.0794, + "num_input_tokens_seen": 7426896, + "step": 8210 + }, + { + "epoch": 2.168140424970305, + "grad_norm": 0.006609665229916573, + "learning_rate": 0.2698535968021577, + "loss": 0.1275, + "num_input_tokens_seen": 7431600, + "step": 8215 + }, + { + "epoch": 2.169460208525802, + "grad_norm": 0.004092466086149216, + "learning_rate": 0.26981816811732684, + "loss": 0.1144, + "num_input_tokens_seen": 7436176, + "step": 8220 + }, + { + "epoch": 2.1707799920812985, + "grad_norm": 0.0034118543844670057, + "learning_rate": 0.26978272095502875, + "loss": 0.1259, + "num_input_tokens_seen": 7440912, + "step": 8225 + }, + { + "epoch": 2.1720997756367955, + "grad_norm": 0.0016921692294999957, + "learning_rate": 0.26974725532072974, + "loss": 0.0669, + "num_input_tokens_seen": 7445456, + "step": 8230 + }, + { + "epoch": 2.1734195591922925, + "grad_norm": 0.0034486553631722927, + "learning_rate": 0.26971177121989914, + "loss": 0.1055, + "num_input_tokens_seen": 7450032, + "step": 8235 + }, + { + "epoch": 2.1747393427477895, + "grad_norm": 0.0028848699294030666, + "learning_rate": 0.2696762686580091, + "loss": 0.0868, + "num_input_tokens_seen": 7454576, + "step": 8240 + }, + { + "epoch": 2.1760591263032865, + "grad_norm": 0.00350133772008121, + "learning_rate": 0.26964074764053436, + "loss": 0.1036, + "num_input_tokens_seen": 7458928, + "step": 8245 + }, + { + "epoch": 2.177378909858783, + "grad_norm": 0.0047735306434333324, + "learning_rate": 0.2696052081729529, + "loss": 0.1136, + "num_input_tokens_seen": 7463472, + "step": 8250 + }, + { + "epoch": 2.17869869341428, + "grad_norm": 0.0020554503425955772, + "learning_rate": 0.2695696502607453, + "loss": 0.0666, + "num_input_tokens_seen": 7467888, + "step": 8255 + }, + { + "epoch": 2.180018476969777, + "grad_norm": 0.004881252069026232, + "learning_rate": 0.26953407390939504, + "loss": 0.0962, + "num_input_tokens_seen": 7472336, + "step": 8260 + }, + { + "epoch": 2.181338260525274, + "grad_norm": 0.0035058718640357256, + "learning_rate": 0.26949847912438835, + "loss": 0.0757, + "num_input_tokens_seen": 7476560, + "step": 8265 + }, + { + "epoch": 2.1826580440807706, + "grad_norm": 0.0036117667332291603, + "learning_rate": 0.26946286591121454, + "loss": 0.1254, + "num_input_tokens_seen": 7480688, + "step": 8270 + }, + { + "epoch": 2.1839778276362676, + "grad_norm": 0.005459676031023264, + "learning_rate": 0.2694272342753655, + "loss": 0.1052, + "num_input_tokens_seen": 7485040, + "step": 8275 + }, + { + "epoch": 2.1852976111917646, + "grad_norm": 0.004640407860279083, + "learning_rate": 0.26939158422233617, + "loss": 0.1534, + "num_input_tokens_seen": 7489520, + "step": 8280 + }, + { + "epoch": 2.1866173947472616, + "grad_norm": 0.003953509498387575, + "learning_rate": 0.26935591575762413, + "loss": 0.1249, + "num_input_tokens_seen": 7494224, + "step": 8285 + }, + { + "epoch": 2.187937178302758, + "grad_norm": 0.0026284230407327414, + "learning_rate": 0.26932022888672996, + "loss": 0.099, + "num_input_tokens_seen": 7498672, + "step": 8290 + }, + { + "epoch": 2.189256961858255, + "grad_norm": 0.0017863778630271554, + "learning_rate": 0.26928452361515703, + "loss": 0.0839, + "num_input_tokens_seen": 7503088, + "step": 8295 + }, + { + "epoch": 2.190576745413752, + "grad_norm": 0.0033612593542784452, + "learning_rate": 0.26924879994841155, + "loss": 0.1486, + "num_input_tokens_seen": 7507888, + "step": 8300 + }, + { + "epoch": 2.191896528969249, + "grad_norm": 0.002436363371089101, + "learning_rate": 0.2692130578920025, + "loss": 0.1142, + "num_input_tokens_seen": 7512368, + "step": 8305 + }, + { + "epoch": 2.193216312524746, + "grad_norm": 0.00330582307651639, + "learning_rate": 0.26917729745144187, + "loss": 0.1153, + "num_input_tokens_seen": 7516816, + "step": 8310 + }, + { + "epoch": 2.1945360960802427, + "grad_norm": 0.0017904043197631836, + "learning_rate": 0.2691415186322443, + "loss": 0.1001, + "num_input_tokens_seen": 7521392, + "step": 8315 + }, + { + "epoch": 2.1958558796357397, + "grad_norm": 0.0031994341406971216, + "learning_rate": 0.2691057214399273, + "loss": 0.0833, + "num_input_tokens_seen": 7526256, + "step": 8320 + }, + { + "epoch": 2.1971756631912367, + "grad_norm": 0.003182962303981185, + "learning_rate": 0.2690699058800113, + "loss": 0.1269, + "num_input_tokens_seen": 7530672, + "step": 8325 + }, + { + "epoch": 2.1984954467467337, + "grad_norm": 0.005622101482003927, + "learning_rate": 0.2690340719580194, + "loss": 0.1176, + "num_input_tokens_seen": 7535440, + "step": 8330 + }, + { + "epoch": 2.1998152303022303, + "grad_norm": 0.0016182934632524848, + "learning_rate": 0.2689982196794778, + "loss": 0.0771, + "num_input_tokens_seen": 7539824, + "step": 8335 + }, + { + "epoch": 2.2011350138577273, + "grad_norm": 0.0048918696120381355, + "learning_rate": 0.2689623490499153, + "loss": 0.1174, + "num_input_tokens_seen": 7544368, + "step": 8340 + }, + { + "epoch": 2.2024547974132243, + "grad_norm": 0.0025668710004538298, + "learning_rate": 0.2689264600748636, + "loss": 0.0997, + "num_input_tokens_seen": 7548912, + "step": 8345 + }, + { + "epoch": 2.2037745809687213, + "grad_norm": 0.003992327023297548, + "learning_rate": 0.26889055275985724, + "loss": 0.1412, + "num_input_tokens_seen": 7553328, + "step": 8350 + }, + { + "epoch": 2.205094364524218, + "grad_norm": 0.0035885947290807962, + "learning_rate": 0.2688546271104335, + "loss": 0.0615, + "num_input_tokens_seen": 7558096, + "step": 8355 + }, + { + "epoch": 2.206414148079715, + "grad_norm": 0.0018969490192830563, + "learning_rate": 0.26881868313213275, + "loss": 0.0935, + "num_input_tokens_seen": 7562992, + "step": 8360 + }, + { + "epoch": 2.207733931635212, + "grad_norm": 0.004182522650808096, + "learning_rate": 0.2687827208304978, + "loss": 0.1127, + "num_input_tokens_seen": 7567664, + "step": 8365 + }, + { + "epoch": 2.209053715190709, + "grad_norm": 0.0009938282892107964, + "learning_rate": 0.26874674021107464, + "loss": 0.1156, + "num_input_tokens_seen": 7571824, + "step": 8370 + }, + { + "epoch": 2.210373498746206, + "grad_norm": 0.0008315101731568575, + "learning_rate": 0.2687107412794118, + "loss": 0.104, + "num_input_tokens_seen": 7576176, + "step": 8375 + }, + { + "epoch": 2.2116932823017024, + "grad_norm": 0.004578572232276201, + "learning_rate": 0.26867472404106096, + "loss": 0.1155, + "num_input_tokens_seen": 7580880, + "step": 8380 + }, + { + "epoch": 2.2130130658571994, + "grad_norm": 0.003867862746119499, + "learning_rate": 0.26863868850157624, + "loss": 0.1001, + "num_input_tokens_seen": 7585424, + "step": 8385 + }, + { + "epoch": 2.2143328494126964, + "grad_norm": 0.004569703247398138, + "learning_rate": 0.26860263466651485, + "loss": 0.1159, + "num_input_tokens_seen": 7589968, + "step": 8390 + }, + { + "epoch": 2.2156526329681934, + "grad_norm": 0.0023524679709225893, + "learning_rate": 0.26856656254143674, + "loss": 0.125, + "num_input_tokens_seen": 7594256, + "step": 8395 + }, + { + "epoch": 2.21697241652369, + "grad_norm": 0.0041366759687662125, + "learning_rate": 0.2685304721319047, + "loss": 0.1211, + "num_input_tokens_seen": 7598544, + "step": 8400 + }, + { + "epoch": 2.21697241652369, + "eval_loss": 0.11270745098590851, + "eval_runtime": 75.8639, + "eval_samples_per_second": 88.777, + "eval_steps_per_second": 22.198, + "num_input_tokens_seen": 7598544, + "step": 8400 + }, + { + "epoch": 2.218292200079187, + "grad_norm": 0.0032321657054126263, + "learning_rate": 0.2684943634434843, + "loss": 0.1148, + "num_input_tokens_seen": 7603024, + "step": 8405 + }, + { + "epoch": 2.219611983634684, + "grad_norm": 0.0049490854144096375, + "learning_rate": 0.268458236481744, + "loss": 0.1093, + "num_input_tokens_seen": 7607728, + "step": 8410 + }, + { + "epoch": 2.220931767190181, + "grad_norm": 0.0025812757667154074, + "learning_rate": 0.2684220912522549, + "loss": 0.0768, + "num_input_tokens_seen": 7612368, + "step": 8415 + }, + { + "epoch": 2.2222515507456775, + "grad_norm": 0.001954148057848215, + "learning_rate": 0.2683859277605913, + "loss": 0.0948, + "num_input_tokens_seen": 7616848, + "step": 8420 + }, + { + "epoch": 2.2235713343011745, + "grad_norm": 0.0022418939042836428, + "learning_rate": 0.2683497460123298, + "loss": 0.1109, + "num_input_tokens_seen": 7621200, + "step": 8425 + }, + { + "epoch": 2.2248911178566715, + "grad_norm": 0.0036794389598071575, + "learning_rate": 0.26831354601305013, + "loss": 0.0807, + "num_input_tokens_seen": 7625680, + "step": 8430 + }, + { + "epoch": 2.2262109014121685, + "grad_norm": 0.003282705554738641, + "learning_rate": 0.26827732776833496, + "loss": 0.1077, + "num_input_tokens_seen": 7630512, + "step": 8435 + }, + { + "epoch": 2.2275306849676655, + "grad_norm": 0.0015249052084982395, + "learning_rate": 0.26824109128376944, + "loss": 0.0938, + "num_input_tokens_seen": 7634928, + "step": 8440 + }, + { + "epoch": 2.228850468523162, + "grad_norm": 0.0024279647041112185, + "learning_rate": 0.2682048365649417, + "loss": 0.0723, + "num_input_tokens_seen": 7639568, + "step": 8445 + }, + { + "epoch": 2.230170252078659, + "grad_norm": 0.004416347481310368, + "learning_rate": 0.2681685636174428, + "loss": 0.1301, + "num_input_tokens_seen": 7644112, + "step": 8450 + }, + { + "epoch": 2.231490035634156, + "grad_norm": 0.003341986332088709, + "learning_rate": 0.2681322724468663, + "loss": 0.1385, + "num_input_tokens_seen": 7648944, + "step": 8455 + }, + { + "epoch": 2.232809819189653, + "grad_norm": 0.002875532489269972, + "learning_rate": 0.2680959630588089, + "loss": 0.0878, + "num_input_tokens_seen": 7653168, + "step": 8460 + }, + { + "epoch": 2.2341296027451496, + "grad_norm": 0.0037752219941467047, + "learning_rate": 0.26805963545886985, + "loss": 0.0985, + "num_input_tokens_seen": 7657744, + "step": 8465 + }, + { + "epoch": 2.2354493863006466, + "grad_norm": 0.00233136466704309, + "learning_rate": 0.26802328965265143, + "loss": 0.0936, + "num_input_tokens_seen": 7662352, + "step": 8470 + }, + { + "epoch": 2.2367691698561436, + "grad_norm": 0.0030436788219958544, + "learning_rate": 0.26798692564575854, + "loss": 0.116, + "num_input_tokens_seen": 7666672, + "step": 8475 + }, + { + "epoch": 2.2380889534116406, + "grad_norm": 0.0018422731664031744, + "learning_rate": 0.26795054344379904, + "loss": 0.1015, + "num_input_tokens_seen": 7670832, + "step": 8480 + }, + { + "epoch": 2.239408736967137, + "grad_norm": 0.002440332667902112, + "learning_rate": 0.2679141430523835, + "loss": 0.1291, + "num_input_tokens_seen": 7675504, + "step": 8485 + }, + { + "epoch": 2.240728520522634, + "grad_norm": 0.002605888992547989, + "learning_rate": 0.2678777244771252, + "loss": 0.1282, + "num_input_tokens_seen": 7679728, + "step": 8490 + }, + { + "epoch": 2.242048304078131, + "grad_norm": 0.004100397694855928, + "learning_rate": 0.2678412877236405, + "loss": 0.1436, + "num_input_tokens_seen": 7684016, + "step": 8495 + }, + { + "epoch": 2.243368087633628, + "grad_norm": 0.0030137752182781696, + "learning_rate": 0.2678048327975484, + "loss": 0.1353, + "num_input_tokens_seen": 7688528, + "step": 8500 + }, + { + "epoch": 2.244687871189125, + "grad_norm": 0.0021892825607210398, + "learning_rate": 0.2677683597044706, + "loss": 0.0882, + "num_input_tokens_seen": 7692880, + "step": 8505 + }, + { + "epoch": 2.2460076547446217, + "grad_norm": 0.002356674987822771, + "learning_rate": 0.2677318684500318, + "loss": 0.1008, + "num_input_tokens_seen": 7697776, + "step": 8510 + }, + { + "epoch": 2.2473274383001187, + "grad_norm": 0.003554515540599823, + "learning_rate": 0.2676953590398593, + "loss": 0.1032, + "num_input_tokens_seen": 7702384, + "step": 8515 + }, + { + "epoch": 2.2486472218556157, + "grad_norm": 0.0015192761784419417, + "learning_rate": 0.2676588314795834, + "loss": 0.0889, + "num_input_tokens_seen": 7707056, + "step": 8520 + }, + { + "epoch": 2.2499670054111127, + "grad_norm": 0.003072360996156931, + "learning_rate": 0.26762228577483715, + "loss": 0.0936, + "num_input_tokens_seen": 7711664, + "step": 8525 + }, + { + "epoch": 2.2512867889666097, + "grad_norm": 0.002085602842271328, + "learning_rate": 0.2675857219312563, + "loss": 0.0829, + "num_input_tokens_seen": 7715696, + "step": 8530 + }, + { + "epoch": 2.2526065725221063, + "grad_norm": 0.0035540799144655466, + "learning_rate": 0.2675491399544794, + "loss": 0.1177, + "num_input_tokens_seen": 7720112, + "step": 8535 + }, + { + "epoch": 2.2539263560776033, + "grad_norm": 0.0028076169546693563, + "learning_rate": 0.2675125398501479, + "loss": 0.0938, + "num_input_tokens_seen": 7724624, + "step": 8540 + }, + { + "epoch": 2.2552461396331003, + "grad_norm": 0.0011349847773090005, + "learning_rate": 0.26747592162390604, + "loss": 0.1138, + "num_input_tokens_seen": 7728912, + "step": 8545 + }, + { + "epoch": 2.256565923188597, + "grad_norm": 0.003665928030386567, + "learning_rate": 0.26743928528140076, + "loss": 0.1181, + "num_input_tokens_seen": 7733392, + "step": 8550 + }, + { + "epoch": 2.257885706744094, + "grad_norm": 0.002420690143480897, + "learning_rate": 0.26740263082828186, + "loss": 0.0743, + "num_input_tokens_seen": 7737680, + "step": 8555 + }, + { + "epoch": 2.259205490299591, + "grad_norm": 0.0018562196055427194, + "learning_rate": 0.2673659582702019, + "loss": 0.121, + "num_input_tokens_seen": 7742096, + "step": 8560 + }, + { + "epoch": 2.260525273855088, + "grad_norm": 0.0030988063663244247, + "learning_rate": 0.2673292676128163, + "loss": 0.1268, + "num_input_tokens_seen": 7746288, + "step": 8565 + }, + { + "epoch": 2.261845057410585, + "grad_norm": 0.0030292512383311987, + "learning_rate": 0.2672925588617831, + "loss": 0.117, + "num_input_tokens_seen": 7750928, + "step": 8570 + }, + { + "epoch": 2.2631648409660814, + "grad_norm": 0.0022027839440852404, + "learning_rate": 0.2672558320227634, + "loss": 0.1236, + "num_input_tokens_seen": 7755408, + "step": 8575 + }, + { + "epoch": 2.2644846245215784, + "grad_norm": 0.00354271219111979, + "learning_rate": 0.2672190871014209, + "loss": 0.0991, + "num_input_tokens_seen": 7759824, + "step": 8580 + }, + { + "epoch": 2.2658044080770754, + "grad_norm": 0.00154802983161062, + "learning_rate": 0.267182324103422, + "loss": 0.0733, + "num_input_tokens_seen": 7764176, + "step": 8585 + }, + { + "epoch": 2.2671241916325724, + "grad_norm": 0.0017925123684108257, + "learning_rate": 0.2671455430344362, + "loss": 0.1126, + "num_input_tokens_seen": 7768816, + "step": 8590 + }, + { + "epoch": 2.2684439751880694, + "grad_norm": 0.0030629350803792477, + "learning_rate": 0.2671087439001355, + "loss": 0.1168, + "num_input_tokens_seen": 7773136, + "step": 8595 + }, + { + "epoch": 2.269763758743566, + "grad_norm": 0.0034710154868662357, + "learning_rate": 0.2670719267061948, + "loss": 0.0878, + "num_input_tokens_seen": 7777744, + "step": 8600 + }, + { + "epoch": 2.269763758743566, + "eval_loss": 0.11830829828977585, + "eval_runtime": 75.8727, + "eval_samples_per_second": 88.767, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 7777744, + "step": 8600 + }, + { + "epoch": 2.271083542299063, + "grad_norm": 0.0022290325723588467, + "learning_rate": 0.2670350914582918, + "loss": 0.0766, + "num_input_tokens_seen": 7782192, + "step": 8605 + }, + { + "epoch": 2.27240332585456, + "grad_norm": 0.0037155449390411377, + "learning_rate": 0.26699823816210694, + "loss": 0.0838, + "num_input_tokens_seen": 7786864, + "step": 8610 + }, + { + "epoch": 2.2737231094100565, + "grad_norm": 0.0038472486194223166, + "learning_rate": 0.26696136682332344, + "loss": 0.1579, + "num_input_tokens_seen": 7791504, + "step": 8615 + }, + { + "epoch": 2.2750428929655535, + "grad_norm": 0.002777901478111744, + "learning_rate": 0.2669244774476274, + "loss": 0.0732, + "num_input_tokens_seen": 7795984, + "step": 8620 + }, + { + "epoch": 2.2763626765210505, + "grad_norm": 0.004038034938275814, + "learning_rate": 0.2668875700407075, + "loss": 0.0936, + "num_input_tokens_seen": 7800368, + "step": 8625 + }, + { + "epoch": 2.2776824600765475, + "grad_norm": 0.003859520424157381, + "learning_rate": 0.26685064460825547, + "loss": 0.1087, + "num_input_tokens_seen": 7805200, + "step": 8630 + }, + { + "epoch": 2.2790022436320445, + "grad_norm": 0.0015939462464302778, + "learning_rate": 0.26681370115596553, + "loss": 0.0931, + "num_input_tokens_seen": 7809456, + "step": 8635 + }, + { + "epoch": 2.280322027187541, + "grad_norm": 0.00336947082541883, + "learning_rate": 0.26677673968953497, + "loss": 0.147, + "num_input_tokens_seen": 7813808, + "step": 8640 + }, + { + "epoch": 2.281641810743038, + "grad_norm": 0.004314909689128399, + "learning_rate": 0.2667397602146636, + "loss": 0.1121, + "num_input_tokens_seen": 7818096, + "step": 8645 + }, + { + "epoch": 2.282961594298535, + "grad_norm": 0.0035734232515096664, + "learning_rate": 0.2667027627370542, + "loss": 0.115, + "num_input_tokens_seen": 7822480, + "step": 8650 + }, + { + "epoch": 2.284281377854032, + "grad_norm": 0.002229400211945176, + "learning_rate": 0.26666574726241216, + "loss": 0.0856, + "num_input_tokens_seen": 7826864, + "step": 8655 + }, + { + "epoch": 2.285601161409529, + "grad_norm": 0.00409464817494154, + "learning_rate": 0.2666287137964458, + "loss": 0.1287, + "num_input_tokens_seen": 7831376, + "step": 8660 + }, + { + "epoch": 2.2869209449650256, + "grad_norm": 0.0032093017362058163, + "learning_rate": 0.26659166234486614, + "loss": 0.1002, + "num_input_tokens_seen": 7835952, + "step": 8665 + }, + { + "epoch": 2.2882407285205226, + "grad_norm": 0.0025399557780474424, + "learning_rate": 0.2665545929133869, + "loss": 0.1693, + "num_input_tokens_seen": 7840592, + "step": 8670 + }, + { + "epoch": 2.2895605120760196, + "grad_norm": 0.004387828055769205, + "learning_rate": 0.2665175055077248, + "loss": 0.1097, + "num_input_tokens_seen": 7845328, + "step": 8675 + }, + { + "epoch": 2.290880295631516, + "grad_norm": 0.0022213233169168234, + "learning_rate": 0.2664804001335991, + "loss": 0.1014, + "num_input_tokens_seen": 7849936, + "step": 8680 + }, + { + "epoch": 2.292200079187013, + "grad_norm": 0.00221627508290112, + "learning_rate": 0.26644327679673185, + "loss": 0.0965, + "num_input_tokens_seen": 7854160, + "step": 8685 + }, + { + "epoch": 2.29351986274251, + "grad_norm": 0.002957342891022563, + "learning_rate": 0.26640613550284803, + "loss": 0.0879, + "num_input_tokens_seen": 7858800, + "step": 8690 + }, + { + "epoch": 2.294839646298007, + "grad_norm": 0.004222025163471699, + "learning_rate": 0.26636897625767525, + "loss": 0.1128, + "num_input_tokens_seen": 7863504, + "step": 8695 + }, + { + "epoch": 2.296159429853504, + "grad_norm": 0.0028751937206834555, + "learning_rate": 0.266331799066944, + "loss": 0.098, + "num_input_tokens_seen": 7868112, + "step": 8700 + }, + { + "epoch": 2.2974792134090007, + "grad_norm": 0.0033275242894887924, + "learning_rate": 0.2662946039363874, + "loss": 0.0906, + "num_input_tokens_seen": 7872656, + "step": 8705 + }, + { + "epoch": 2.2987989969644977, + "grad_norm": 0.0018308170838281512, + "learning_rate": 0.2662573908717414, + "loss": 0.1106, + "num_input_tokens_seen": 7876976, + "step": 8710 + }, + { + "epoch": 2.3001187805199947, + "grad_norm": 0.004877574276179075, + "learning_rate": 0.2662201598787447, + "loss": 0.1159, + "num_input_tokens_seen": 7881872, + "step": 8715 + }, + { + "epoch": 2.3014385640754917, + "grad_norm": 0.006185438018292189, + "learning_rate": 0.2661829109631389, + "loss": 0.0985, + "num_input_tokens_seen": 7886480, + "step": 8720 + }, + { + "epoch": 2.3027583476309887, + "grad_norm": 0.002627038164064288, + "learning_rate": 0.26614564413066816, + "loss": 0.1153, + "num_input_tokens_seen": 7891184, + "step": 8725 + }, + { + "epoch": 2.3040781311864853, + "grad_norm": 0.0030381078831851482, + "learning_rate": 0.2661083593870795, + "loss": 0.1412, + "num_input_tokens_seen": 7895792, + "step": 8730 + }, + { + "epoch": 2.3053979147419823, + "grad_norm": 0.004649925511330366, + "learning_rate": 0.26607105673812276, + "loss": 0.1246, + "num_input_tokens_seen": 7900432, + "step": 8735 + }, + { + "epoch": 2.3067176982974793, + "grad_norm": 0.0022211316972970963, + "learning_rate": 0.2660337361895504, + "loss": 0.1248, + "num_input_tokens_seen": 7904816, + "step": 8740 + }, + { + "epoch": 2.3080374818529763, + "grad_norm": 0.0016885806107893586, + "learning_rate": 0.26599639774711775, + "loss": 0.0845, + "num_input_tokens_seen": 7909296, + "step": 8745 + }, + { + "epoch": 2.309357265408473, + "grad_norm": 0.0039102197624742985, + "learning_rate": 0.2659590414165829, + "loss": 0.1528, + "num_input_tokens_seen": 7913424, + "step": 8750 + }, + { + "epoch": 2.31067704896397, + "grad_norm": 0.002229146659374237, + "learning_rate": 0.2659216672037066, + "loss": 0.1055, + "num_input_tokens_seen": 7918192, + "step": 8755 + }, + { + "epoch": 2.311996832519467, + "grad_norm": 0.002004374749958515, + "learning_rate": 0.26588427511425244, + "loss": 0.1072, + "num_input_tokens_seen": 7922704, + "step": 8760 + }, + { + "epoch": 2.313316616074964, + "grad_norm": 0.001078281900845468, + "learning_rate": 0.26584686515398676, + "loss": 0.1175, + "num_input_tokens_seen": 7927120, + "step": 8765 + }, + { + "epoch": 2.3146363996304604, + "grad_norm": 0.004028796683996916, + "learning_rate": 0.2658094373286787, + "loss": 0.1218, + "num_input_tokens_seen": 7931440, + "step": 8770 + }, + { + "epoch": 2.3159561831859574, + "grad_norm": 0.002612076234072447, + "learning_rate": 0.2657719916441, + "loss": 0.1179, + "num_input_tokens_seen": 7935472, + "step": 8775 + }, + { + "epoch": 2.3172759667414544, + "grad_norm": 0.0019703886937350035, + "learning_rate": 0.2657345281060253, + "loss": 0.1277, + "num_input_tokens_seen": 7939696, + "step": 8780 + }, + { + "epoch": 2.3185957502969514, + "grad_norm": 0.002447047270834446, + "learning_rate": 0.26569704672023203, + "loss": 0.0874, + "num_input_tokens_seen": 7944272, + "step": 8785 + }, + { + "epoch": 2.3199155338524484, + "grad_norm": 0.004256071988493204, + "learning_rate": 0.26565954749250015, + "loss": 0.0977, + "num_input_tokens_seen": 7948592, + "step": 8790 + }, + { + "epoch": 2.321235317407945, + "grad_norm": 0.004451436921954155, + "learning_rate": 0.2656220304286126, + "loss": 0.1013, + "num_input_tokens_seen": 7952944, + "step": 8795 + }, + { + "epoch": 2.322555100963442, + "grad_norm": 0.003895878093317151, + "learning_rate": 0.265584495534355, + "loss": 0.1417, + "num_input_tokens_seen": 7957552, + "step": 8800 + }, + { + "epoch": 2.322555100963442, + "eval_loss": 0.11305573582649231, + "eval_runtime": 75.8338, + "eval_samples_per_second": 88.813, + "eval_steps_per_second": 22.206, + "num_input_tokens_seen": 7957552, + "step": 8800 + }, + { + "epoch": 2.323874884518939, + "grad_norm": 0.0026837941259145737, + "learning_rate": 0.2655469428155156, + "loss": 0.1149, + "num_input_tokens_seen": 7962256, + "step": 8805 + }, + { + "epoch": 2.325194668074436, + "grad_norm": 0.003086973214522004, + "learning_rate": 0.2655093722778856, + "loss": 0.1208, + "num_input_tokens_seen": 7966800, + "step": 8810 + }, + { + "epoch": 2.3265144516299325, + "grad_norm": 0.002530438592657447, + "learning_rate": 0.2654717839272588, + "loss": 0.0919, + "num_input_tokens_seen": 7971504, + "step": 8815 + }, + { + "epoch": 2.3278342351854295, + "grad_norm": 0.0027032988145947456, + "learning_rate": 0.2654341777694318, + "loss": 0.0964, + "num_input_tokens_seen": 7976144, + "step": 8820 + }, + { + "epoch": 2.3291540187409265, + "grad_norm": 0.0034701216500252485, + "learning_rate": 0.265396553810204, + "loss": 0.1191, + "num_input_tokens_seen": 7980592, + "step": 8825 + }, + { + "epoch": 2.3304738022964235, + "grad_norm": 0.003095222869887948, + "learning_rate": 0.26535891205537737, + "loss": 0.139, + "num_input_tokens_seen": 7984816, + "step": 8830 + }, + { + "epoch": 2.33179358585192, + "grad_norm": 0.00393716711550951, + "learning_rate": 0.26532125251075683, + "loss": 0.1255, + "num_input_tokens_seen": 7989104, + "step": 8835 + }, + { + "epoch": 2.333113369407417, + "grad_norm": 0.0022236336953938007, + "learning_rate": 0.26528357518214996, + "loss": 0.1409, + "num_input_tokens_seen": 7994000, + "step": 8840 + }, + { + "epoch": 2.334433152962914, + "grad_norm": 0.0037768033798784018, + "learning_rate": 0.26524588007536704, + "loss": 0.1104, + "num_input_tokens_seen": 7998448, + "step": 8845 + }, + { + "epoch": 2.335752936518411, + "grad_norm": 0.003936669789254665, + "learning_rate": 0.26520816719622115, + "loss": 0.1367, + "num_input_tokens_seen": 8003024, + "step": 8850 + }, + { + "epoch": 2.337072720073908, + "grad_norm": 0.0036770992446690798, + "learning_rate": 0.2651704365505281, + "loss": 0.1616, + "num_input_tokens_seen": 8007696, + "step": 8855 + }, + { + "epoch": 2.3383925036294047, + "grad_norm": 0.0036172119434922934, + "learning_rate": 0.26513268814410634, + "loss": 0.1149, + "num_input_tokens_seen": 8012112, + "step": 8860 + }, + { + "epoch": 2.3397122871849017, + "grad_norm": 0.0037649988662451506, + "learning_rate": 0.2650949219827773, + "loss": 0.1013, + "num_input_tokens_seen": 8016816, + "step": 8865 + }, + { + "epoch": 2.3410320707403987, + "grad_norm": 0.003251992864534259, + "learning_rate": 0.26505713807236486, + "loss": 0.1163, + "num_input_tokens_seen": 8021488, + "step": 8870 + }, + { + "epoch": 2.3423518542958957, + "grad_norm": 0.003951073158532381, + "learning_rate": 0.26501933641869585, + "loss": 0.1187, + "num_input_tokens_seen": 8025520, + "step": 8875 + }, + { + "epoch": 2.343671637851392, + "grad_norm": 0.005453299731016159, + "learning_rate": 0.26498151702759976, + "loss": 0.1321, + "num_input_tokens_seen": 8029936, + "step": 8880 + }, + { + "epoch": 2.344991421406889, + "grad_norm": 0.0015176471788436174, + "learning_rate": 0.2649436799049088, + "loss": 0.0605, + "num_input_tokens_seen": 8034416, + "step": 8885 + }, + { + "epoch": 2.346311204962386, + "grad_norm": 0.0021251258440315723, + "learning_rate": 0.2649058250564579, + "loss": 0.0632, + "num_input_tokens_seen": 8038800, + "step": 8890 + }, + { + "epoch": 2.347630988517883, + "grad_norm": 0.0021754896733909845, + "learning_rate": 0.26486795248808476, + "loss": 0.0848, + "num_input_tokens_seen": 8043504, + "step": 8895 + }, + { + "epoch": 2.3489507720733798, + "grad_norm": 0.0021375943906605244, + "learning_rate": 0.2648300622056298, + "loss": 0.0767, + "num_input_tokens_seen": 8047984, + "step": 8900 + }, + { + "epoch": 2.3502705556288768, + "grad_norm": 0.0011918239761143923, + "learning_rate": 0.2647921542149363, + "loss": 0.1026, + "num_input_tokens_seen": 8052272, + "step": 8905 + }, + { + "epoch": 2.3515903391843738, + "grad_norm": 0.002330426825210452, + "learning_rate": 0.26475422852185, + "loss": 0.0846, + "num_input_tokens_seen": 8056848, + "step": 8910 + }, + { + "epoch": 2.3529101227398708, + "grad_norm": 0.0038832719437777996, + "learning_rate": 0.2647162851322196, + "loss": 0.1271, + "num_input_tokens_seen": 8061424, + "step": 8915 + }, + { + "epoch": 2.3542299062953678, + "grad_norm": 0.003423130139708519, + "learning_rate": 0.2646783240518964, + "loss": 0.1504, + "num_input_tokens_seen": 8066128, + "step": 8920 + }, + { + "epoch": 2.3555496898508643, + "grad_norm": 0.002355910139158368, + "learning_rate": 0.26464034528673447, + "loss": 0.0957, + "num_input_tokens_seen": 8070864, + "step": 8925 + }, + { + "epoch": 2.3568694734063613, + "grad_norm": 0.004047430586069822, + "learning_rate": 0.26460234884259065, + "loss": 0.1182, + "num_input_tokens_seen": 8075440, + "step": 8930 + }, + { + "epoch": 2.3581892569618583, + "grad_norm": 0.003153266618028283, + "learning_rate": 0.2645643347253245, + "loss": 0.1033, + "num_input_tokens_seen": 8079920, + "step": 8935 + }, + { + "epoch": 2.3595090405173553, + "grad_norm": 0.0029230534564703703, + "learning_rate": 0.2645263029407982, + "loss": 0.0863, + "num_input_tokens_seen": 8084624, + "step": 8940 + }, + { + "epoch": 2.360828824072852, + "grad_norm": 0.002021418185904622, + "learning_rate": 0.2644882534948767, + "loss": 0.1521, + "num_input_tokens_seen": 8089008, + "step": 8945 + }, + { + "epoch": 2.362148607628349, + "grad_norm": 0.0028103061486035585, + "learning_rate": 0.2644501863934278, + "loss": 0.1127, + "num_input_tokens_seen": 8093648, + "step": 8950 + }, + { + "epoch": 2.363468391183846, + "grad_norm": 0.002568298950791359, + "learning_rate": 0.26441210164232193, + "loss": 0.1468, + "num_input_tokens_seen": 8098096, + "step": 8955 + }, + { + "epoch": 2.364788174739343, + "grad_norm": 0.002158243441954255, + "learning_rate": 0.26437399924743216, + "loss": 0.1051, + "num_input_tokens_seen": 8102576, + "step": 8960 + }, + { + "epoch": 2.3661079582948394, + "grad_norm": 0.001517103984951973, + "learning_rate": 0.26433587921463436, + "loss": 0.0928, + "num_input_tokens_seen": 8106960, + "step": 8965 + }, + { + "epoch": 2.3674277418503364, + "grad_norm": 0.0030290272552520037, + "learning_rate": 0.2642977415498072, + "loss": 0.1453, + "num_input_tokens_seen": 8111280, + "step": 8970 + }, + { + "epoch": 2.3687475254058334, + "grad_norm": 0.0018874683883041143, + "learning_rate": 0.26425958625883195, + "loss": 0.0908, + "num_input_tokens_seen": 8116112, + "step": 8975 + }, + { + "epoch": 2.3700673089613304, + "grad_norm": 0.0022264854051172733, + "learning_rate": 0.2642214133475926, + "loss": 0.0625, + "num_input_tokens_seen": 8120656, + "step": 8980 + }, + { + "epoch": 2.3713870925168274, + "grad_norm": 0.0020379656925797462, + "learning_rate": 0.26418322282197587, + "loss": 0.0779, + "num_input_tokens_seen": 8125040, + "step": 8985 + }, + { + "epoch": 2.372706876072324, + "grad_norm": 0.0028501080814749002, + "learning_rate": 0.2641450146878714, + "loss": 0.1088, + "num_input_tokens_seen": 8129328, + "step": 8990 + }, + { + "epoch": 2.374026659627821, + "grad_norm": 0.00538316834717989, + "learning_rate": 0.26410678895117107, + "loss": 0.1199, + "num_input_tokens_seen": 8133872, + "step": 8995 + }, + { + "epoch": 2.375346443183318, + "grad_norm": 0.0023783843498677015, + "learning_rate": 0.26406854561777, + "loss": 0.0817, + "num_input_tokens_seen": 8138448, + "step": 9000 + }, + { + "epoch": 2.375346443183318, + "eval_loss": 0.1092529147863388, + "eval_runtime": 75.9109, + "eval_samples_per_second": 88.722, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 8138448, + "step": 9000 + }, + { + "epoch": 2.376666226738815, + "grad_norm": 0.0038925930857658386, + "learning_rate": 0.26403028469356576, + "loss": 0.0794, + "num_input_tokens_seen": 8142992, + "step": 9005 + }, + { + "epoch": 2.3779860102943116, + "grad_norm": 0.0018953671678900719, + "learning_rate": 0.2639920061844585, + "loss": 0.0901, + "num_input_tokens_seen": 8147376, + "step": 9010 + }, + { + "epoch": 2.3793057938498086, + "grad_norm": 0.003981275483965874, + "learning_rate": 0.2639537100963515, + "loss": 0.0955, + "num_input_tokens_seen": 8152112, + "step": 9015 + }, + { + "epoch": 2.3806255774053056, + "grad_norm": 0.002415246097370982, + "learning_rate": 0.26391539643515033, + "loss": 0.101, + "num_input_tokens_seen": 8156560, + "step": 9020 + }, + { + "epoch": 2.3819453609608026, + "grad_norm": 0.0032596210949122906, + "learning_rate": 0.26387706520676346, + "loss": 0.1244, + "num_input_tokens_seen": 8160976, + "step": 9025 + }, + { + "epoch": 2.383265144516299, + "grad_norm": 0.0044893017038702965, + "learning_rate": 0.26383871641710205, + "loss": 0.0938, + "num_input_tokens_seen": 8165104, + "step": 9030 + }, + { + "epoch": 2.384584928071796, + "grad_norm": 0.0021944246254861355, + "learning_rate": 0.26380035007208, + "loss": 0.0852, + "num_input_tokens_seen": 8169712, + "step": 9035 + }, + { + "epoch": 2.385904711627293, + "grad_norm": 0.0024492652155458927, + "learning_rate": 0.26376196617761394, + "loss": 0.1087, + "num_input_tokens_seen": 8174096, + "step": 9040 + }, + { + "epoch": 2.38722449518279, + "grad_norm": 0.002009778283536434, + "learning_rate": 0.263723564739623, + "loss": 0.0919, + "num_input_tokens_seen": 8178544, + "step": 9045 + }, + { + "epoch": 2.388544278738287, + "grad_norm": 0.003966021351516247, + "learning_rate": 0.2636851457640293, + "loss": 0.1147, + "num_input_tokens_seen": 8182960, + "step": 9050 + }, + { + "epoch": 2.3898640622937837, + "grad_norm": 0.0036941422149538994, + "learning_rate": 0.26364670925675737, + "loss": 0.1068, + "num_input_tokens_seen": 8187600, + "step": 9055 + }, + { + "epoch": 2.3911838458492807, + "grad_norm": 0.0030478681437671185, + "learning_rate": 0.2636082552237347, + "loss": 0.118, + "num_input_tokens_seen": 8191728, + "step": 9060 + }, + { + "epoch": 2.3925036294047777, + "grad_norm": 0.0034817273262888193, + "learning_rate": 0.26356978367089146, + "loss": 0.1179, + "num_input_tokens_seen": 8196240, + "step": 9065 + }, + { + "epoch": 2.3938234129602747, + "grad_norm": 0.0030906694009900093, + "learning_rate": 0.26353129460416036, + "loss": 0.0921, + "num_input_tokens_seen": 8200912, + "step": 9070 + }, + { + "epoch": 2.3951431965157712, + "grad_norm": 0.002270154654979706, + "learning_rate": 0.2634927880294769, + "loss": 0.1001, + "num_input_tokens_seen": 8205520, + "step": 9075 + }, + { + "epoch": 2.3964629800712682, + "grad_norm": 0.003992791287600994, + "learning_rate": 0.26345426395277927, + "loss": 0.1455, + "num_input_tokens_seen": 8210064, + "step": 9080 + }, + { + "epoch": 2.3977827636267652, + "grad_norm": 0.0028252394404262304, + "learning_rate": 0.2634157223800084, + "loss": 0.1024, + "num_input_tokens_seen": 8214672, + "step": 9085 + }, + { + "epoch": 2.3991025471822622, + "grad_norm": 0.0031174104660749435, + "learning_rate": 0.26337716331710787, + "loss": 0.097, + "num_input_tokens_seen": 8219216, + "step": 9090 + }, + { + "epoch": 2.400422330737759, + "grad_norm": 0.0027147335931658745, + "learning_rate": 0.2633385867700239, + "loss": 0.1017, + "num_input_tokens_seen": 8223664, + "step": 9095 + }, + { + "epoch": 2.401742114293256, + "grad_norm": 0.004351634997874498, + "learning_rate": 0.2632999927447056, + "loss": 0.1092, + "num_input_tokens_seen": 8228240, + "step": 9100 + }, + { + "epoch": 2.403061897848753, + "grad_norm": 0.002191523788496852, + "learning_rate": 0.2632613812471046, + "loss": 0.089, + "num_input_tokens_seen": 8232976, + "step": 9105 + }, + { + "epoch": 2.40438168140425, + "grad_norm": 0.004257769323885441, + "learning_rate": 0.2632227522831753, + "loss": 0.1263, + "num_input_tokens_seen": 8238000, + "step": 9110 + }, + { + "epoch": 2.405701464959747, + "grad_norm": 0.0027422537095844746, + "learning_rate": 0.26318410585887475, + "loss": 0.1024, + "num_input_tokens_seen": 8242448, + "step": 9115 + }, + { + "epoch": 2.4070212485152434, + "grad_norm": 0.0019133597379550338, + "learning_rate": 0.2631454419801627, + "loss": 0.0963, + "num_input_tokens_seen": 8247088, + "step": 9120 + }, + { + "epoch": 2.4083410320707404, + "grad_norm": 0.0026185212191194296, + "learning_rate": 0.2631067606530016, + "loss": 0.069, + "num_input_tokens_seen": 8251248, + "step": 9125 + }, + { + "epoch": 2.4096608156262374, + "grad_norm": 0.004027882125228643, + "learning_rate": 0.2630680618833567, + "loss": 0.09, + "num_input_tokens_seen": 8255664, + "step": 9130 + }, + { + "epoch": 2.4109805991817344, + "grad_norm": 0.002170783234760165, + "learning_rate": 0.26302934567719566, + "loss": 0.0778, + "num_input_tokens_seen": 8260528, + "step": 9135 + }, + { + "epoch": 2.412300382737231, + "grad_norm": 0.0023971903137862682, + "learning_rate": 0.2629906120404892, + "loss": 0.1301, + "num_input_tokens_seen": 8265328, + "step": 9140 + }, + { + "epoch": 2.413620166292728, + "grad_norm": 0.002527473494410515, + "learning_rate": 0.26295186097921036, + "loss": 0.1145, + "num_input_tokens_seen": 8270192, + "step": 9145 + }, + { + "epoch": 2.414939949848225, + "grad_norm": 0.0024811094626784325, + "learning_rate": 0.2629130924993351, + "loss": 0.0999, + "num_input_tokens_seen": 8274800, + "step": 9150 + }, + { + "epoch": 2.416259733403722, + "grad_norm": 0.002320249564945698, + "learning_rate": 0.2628743066068421, + "loss": 0.1189, + "num_input_tokens_seen": 8279632, + "step": 9155 + }, + { + "epoch": 2.4175795169592185, + "grad_norm": 0.0033028048928827047, + "learning_rate": 0.26283550330771244, + "loss": 0.128, + "num_input_tokens_seen": 8284304, + "step": 9160 + }, + { + "epoch": 2.4188993005147155, + "grad_norm": 0.0013160300441086292, + "learning_rate": 0.2627966826079303, + "loss": 0.0924, + "num_input_tokens_seen": 8289136, + "step": 9165 + }, + { + "epoch": 2.4202190840702125, + "grad_norm": 0.004113232251256704, + "learning_rate": 0.26275784451348216, + "loss": 0.1404, + "num_input_tokens_seen": 8293808, + "step": 9170 + }, + { + "epoch": 2.4215388676257095, + "grad_norm": 0.001975861843675375, + "learning_rate": 0.2627189890303574, + "loss": 0.0855, + "num_input_tokens_seen": 8298320, + "step": 9175 + }, + { + "epoch": 2.4228586511812065, + "grad_norm": 0.0029244069010019302, + "learning_rate": 0.262680116164548, + "loss": 0.1152, + "num_input_tokens_seen": 8303216, + "step": 9180 + }, + { + "epoch": 2.424178434736703, + "grad_norm": 0.005074602551758289, + "learning_rate": 0.2626412259220487, + "loss": 0.0965, + "num_input_tokens_seen": 8307792, + "step": 9185 + }, + { + "epoch": 2.4254982182922, + "grad_norm": 0.0038657034747302532, + "learning_rate": 0.2626023183088568, + "loss": 0.0925, + "num_input_tokens_seen": 8312240, + "step": 9190 + }, + { + "epoch": 2.426818001847697, + "grad_norm": 0.0017267306102439761, + "learning_rate": 0.26256339333097234, + "loss": 0.0819, + "num_input_tokens_seen": 8316880, + "step": 9195 + }, + { + "epoch": 2.428137785403194, + "grad_norm": 0.004154065158218145, + "learning_rate": 0.2625244509943981, + "loss": 0.0916, + "num_input_tokens_seen": 8321584, + "step": 9200 + }, + { + "epoch": 2.428137785403194, + "eval_loss": 0.111021488904953, + "eval_runtime": 75.8618, + "eval_samples_per_second": 88.78, + "eval_steps_per_second": 22.198, + "num_input_tokens_seen": 8321584, + "step": 9200 + }, + { + "epoch": 2.4294575689586906, + "grad_norm": 0.0030004573054611683, + "learning_rate": 0.2624854913051395, + "loss": 0.0995, + "num_input_tokens_seen": 8326096, + "step": 9205 + }, + { + "epoch": 2.4307773525141876, + "grad_norm": 0.005440327804535627, + "learning_rate": 0.26244651426920446, + "loss": 0.1611, + "num_input_tokens_seen": 8330864, + "step": 9210 + }, + { + "epoch": 2.4320971360696846, + "grad_norm": 0.00365128624252975, + "learning_rate": 0.26240751989260386, + "loss": 0.1354, + "num_input_tokens_seen": 8335280, + "step": 9215 + }, + { + "epoch": 2.4334169196251816, + "grad_norm": 0.004590292926877737, + "learning_rate": 0.2623685081813511, + "loss": 0.166, + "num_input_tokens_seen": 8339824, + "step": 9220 + }, + { + "epoch": 2.434736703180678, + "grad_norm": 0.003134810132905841, + "learning_rate": 0.2623294791414623, + "loss": 0.1476, + "num_input_tokens_seen": 8344272, + "step": 9225 + }, + { + "epoch": 2.436056486736175, + "grad_norm": 0.0018373120110481977, + "learning_rate": 0.26229043277895614, + "loss": 0.1079, + "num_input_tokens_seen": 8348912, + "step": 9230 + }, + { + "epoch": 2.437376270291672, + "grad_norm": 0.0026965655852109194, + "learning_rate": 0.2622513690998542, + "loss": 0.1378, + "num_input_tokens_seen": 8353424, + "step": 9235 + }, + { + "epoch": 2.438696053847169, + "grad_norm": 0.001363061834126711, + "learning_rate": 0.26221228811018044, + "loss": 0.0908, + "num_input_tokens_seen": 8357584, + "step": 9240 + }, + { + "epoch": 2.440015837402666, + "grad_norm": 0.0016819544835016131, + "learning_rate": 0.2621731898159617, + "loss": 0.1032, + "num_input_tokens_seen": 8362032, + "step": 9245 + }, + { + "epoch": 2.4413356209581627, + "grad_norm": 0.003599472576752305, + "learning_rate": 0.26213407422322743, + "loss": 0.1316, + "num_input_tokens_seen": 8366352, + "step": 9250 + }, + { + "epoch": 2.4426554045136597, + "grad_norm": 0.0019022057531401515, + "learning_rate": 0.2620949413380098, + "loss": 0.1241, + "num_input_tokens_seen": 8370896, + "step": 9255 + }, + { + "epoch": 2.4439751880691567, + "grad_norm": 0.002342561725527048, + "learning_rate": 0.26205579116634353, + "loss": 0.1142, + "num_input_tokens_seen": 8375088, + "step": 9260 + }, + { + "epoch": 2.4452949716246537, + "grad_norm": 0.001964505296200514, + "learning_rate": 0.26201662371426604, + "loss": 0.0924, + "num_input_tokens_seen": 8379472, + "step": 9265 + }, + { + "epoch": 2.4466147551801507, + "grad_norm": 0.0016615705098956823, + "learning_rate": 0.2619774389878175, + "loss": 0.0716, + "num_input_tokens_seen": 8384240, + "step": 9270 + }, + { + "epoch": 2.4479345387356473, + "grad_norm": 0.0016512172296643257, + "learning_rate": 0.2619382369930407, + "loss": 0.1078, + "num_input_tokens_seen": 8388752, + "step": 9275 + }, + { + "epoch": 2.4492543222911443, + "grad_norm": 0.00247852667234838, + "learning_rate": 0.261899017735981, + "loss": 0.1079, + "num_input_tokens_seen": 8393424, + "step": 9280 + }, + { + "epoch": 2.4505741058466413, + "grad_norm": 0.003288436681032181, + "learning_rate": 0.2618597812226866, + "loss": 0.1115, + "num_input_tokens_seen": 8397712, + "step": 9285 + }, + { + "epoch": 2.451893889402138, + "grad_norm": 0.002906635869294405, + "learning_rate": 0.2618205274592082, + "loss": 0.1076, + "num_input_tokens_seen": 8402512, + "step": 9290 + }, + { + "epoch": 2.453213672957635, + "grad_norm": 0.0023447212297469378, + "learning_rate": 0.2617812564515992, + "loss": 0.1514, + "num_input_tokens_seen": 8407088, + "step": 9295 + }, + { + "epoch": 2.454533456513132, + "grad_norm": 0.003356894478201866, + "learning_rate": 0.2617419682059158, + "loss": 0.1316, + "num_input_tokens_seen": 8411728, + "step": 9300 + }, + { + "epoch": 2.455853240068629, + "grad_norm": 0.002387541811913252, + "learning_rate": 0.26170266272821663, + "loss": 0.1511, + "num_input_tokens_seen": 8416208, + "step": 9305 + }, + { + "epoch": 2.457173023624126, + "grad_norm": 0.0021186110097914934, + "learning_rate": 0.26166334002456315, + "loss": 0.1125, + "num_input_tokens_seen": 8420656, + "step": 9310 + }, + { + "epoch": 2.4584928071796224, + "grad_norm": 0.003521878505125642, + "learning_rate": 0.2616240001010194, + "loss": 0.1203, + "num_input_tokens_seen": 8425392, + "step": 9315 + }, + { + "epoch": 2.4598125907351194, + "grad_norm": 0.002790649887174368, + "learning_rate": 0.26158464296365197, + "loss": 0.1067, + "num_input_tokens_seen": 8429936, + "step": 9320 + }, + { + "epoch": 2.4611323742906164, + "grad_norm": 0.0021397764794528484, + "learning_rate": 0.2615452686185304, + "loss": 0.0939, + "num_input_tokens_seen": 8434672, + "step": 9325 + }, + { + "epoch": 2.4624521578461134, + "grad_norm": 0.002174742752686143, + "learning_rate": 0.26150587707172673, + "loss": 0.0966, + "num_input_tokens_seen": 8439152, + "step": 9330 + }, + { + "epoch": 2.4637719414016104, + "grad_norm": 0.0025662346743047237, + "learning_rate": 0.2614664683293154, + "loss": 0.1089, + "num_input_tokens_seen": 8443568, + "step": 9335 + }, + { + "epoch": 2.465091724957107, + "grad_norm": 0.0024782398249953985, + "learning_rate": 0.26142704239737397, + "loss": 0.0872, + "num_input_tokens_seen": 8447856, + "step": 9340 + }, + { + "epoch": 2.466411508512604, + "grad_norm": 0.0020788887050002813, + "learning_rate": 0.26138759928198235, + "loss": 0.1032, + "num_input_tokens_seen": 8452112, + "step": 9345 + }, + { + "epoch": 2.467731292068101, + "grad_norm": 0.0014241996686905622, + "learning_rate": 0.26134813898922304, + "loss": 0.1041, + "num_input_tokens_seen": 8456752, + "step": 9350 + }, + { + "epoch": 2.4690510756235975, + "grad_norm": 0.0034955190494656563, + "learning_rate": 0.26130866152518145, + "loss": 0.0838, + "num_input_tokens_seen": 8461232, + "step": 9355 + }, + { + "epoch": 2.4703708591790945, + "grad_norm": 0.002111198613420129, + "learning_rate": 0.2612691668959455, + "loss": 0.0664, + "num_input_tokens_seen": 8465616, + "step": 9360 + }, + { + "epoch": 2.4716906427345915, + "grad_norm": 0.0024082360323518515, + "learning_rate": 0.2612296551076057, + "loss": 0.0873, + "num_input_tokens_seen": 8470416, + "step": 9365 + }, + { + "epoch": 2.4730104262900885, + "grad_norm": 0.0025760107673704624, + "learning_rate": 0.26119012616625525, + "loss": 0.0763, + "num_input_tokens_seen": 8475088, + "step": 9370 + }, + { + "epoch": 2.4743302098455855, + "grad_norm": 0.0026233126409351826, + "learning_rate": 0.26115058007799, + "loss": 0.0711, + "num_input_tokens_seen": 8479504, + "step": 9375 + }, + { + "epoch": 2.475649993401082, + "grad_norm": 0.004143731668591499, + "learning_rate": 0.26111101684890864, + "loss": 0.0996, + "num_input_tokens_seen": 8483824, + "step": 9380 + }, + { + "epoch": 2.476969776956579, + "grad_norm": 0.0038966042920947075, + "learning_rate": 0.26107143648511205, + "loss": 0.0889, + "num_input_tokens_seen": 8488272, + "step": 9385 + }, + { + "epoch": 2.478289560512076, + "grad_norm": 0.001941170310601592, + "learning_rate": 0.2610318389927042, + "loss": 0.0679, + "num_input_tokens_seen": 8492880, + "step": 9390 + }, + { + "epoch": 2.479609344067573, + "grad_norm": 0.0030753626488149166, + "learning_rate": 0.26099222437779146, + "loss": 0.1107, + "num_input_tokens_seen": 8497456, + "step": 9395 + }, + { + "epoch": 2.48092912762307, + "grad_norm": 0.0016328277997672558, + "learning_rate": 0.26095259264648285, + "loss": 0.092, + "num_input_tokens_seen": 8502288, + "step": 9400 + }, + { + "epoch": 2.48092912762307, + "eval_loss": 0.11011672765016556, + "eval_runtime": 75.921, + "eval_samples_per_second": 88.711, + "eval_steps_per_second": 22.181, + "num_input_tokens_seen": 8502288, + "step": 9400 + }, + { + "epoch": 2.4822489111785666, + "grad_norm": 0.004815850406885147, + "learning_rate": 0.2609129438048902, + "loss": 0.1677, + "num_input_tokens_seen": 8506640, + "step": 9405 + }, + { + "epoch": 2.4835686947340636, + "grad_norm": 0.002683349885046482, + "learning_rate": 0.2608732778591278, + "loss": 0.0935, + "num_input_tokens_seen": 8511312, + "step": 9410 + }, + { + "epoch": 2.4848884782895606, + "grad_norm": 0.0026700561866164207, + "learning_rate": 0.2608335948153126, + "loss": 0.1112, + "num_input_tokens_seen": 8515536, + "step": 9415 + }, + { + "epoch": 2.486208261845057, + "grad_norm": 0.0027360671665519476, + "learning_rate": 0.26079389467956426, + "loss": 0.1029, + "num_input_tokens_seen": 8520080, + "step": 9420 + }, + { + "epoch": 2.487528045400554, + "grad_norm": 0.001801011967472732, + "learning_rate": 0.26075417745800505, + "loss": 0.1272, + "num_input_tokens_seen": 8524912, + "step": 9425 + }, + { + "epoch": 2.488847828956051, + "grad_norm": 0.0037803081795573235, + "learning_rate": 0.26071444315675985, + "loss": 0.1255, + "num_input_tokens_seen": 8529456, + "step": 9430 + }, + { + "epoch": 2.490167612511548, + "grad_norm": 0.001212680828757584, + "learning_rate": 0.2606746917819562, + "loss": 0.0916, + "num_input_tokens_seen": 8534224, + "step": 9435 + }, + { + "epoch": 2.491487396067045, + "grad_norm": 0.0020091652404516935, + "learning_rate": 0.2606349233397242, + "loss": 0.1034, + "num_input_tokens_seen": 8538480, + "step": 9440 + }, + { + "epoch": 2.4928071796225417, + "grad_norm": 0.003071355167776346, + "learning_rate": 0.26059513783619676, + "loss": 0.1134, + "num_input_tokens_seen": 8543472, + "step": 9445 + }, + { + "epoch": 2.4941269631780387, + "grad_norm": 0.0029851305298507214, + "learning_rate": 0.26055533527750924, + "loss": 0.083, + "num_input_tokens_seen": 8548112, + "step": 9450 + }, + { + "epoch": 2.4954467467335357, + "grad_norm": 0.002198934555053711, + "learning_rate": 0.26051551566979964, + "loss": 0.0705, + "num_input_tokens_seen": 8552656, + "step": 9455 + }, + { + "epoch": 2.4967665302890327, + "grad_norm": 0.006368421018123627, + "learning_rate": 0.26047567901920876, + "loss": 0.11, + "num_input_tokens_seen": 8557328, + "step": 9460 + }, + { + "epoch": 2.4980863138445297, + "grad_norm": 0.0029004565440118313, + "learning_rate": 0.2604358253318798, + "loss": 0.1082, + "num_input_tokens_seen": 8561616, + "step": 9465 + }, + { + "epoch": 2.4994060974000263, + "grad_norm": 0.0028164552059024572, + "learning_rate": 0.26039595461395876, + "loss": 0.1121, + "num_input_tokens_seen": 8565872, + "step": 9470 + }, + { + "epoch": 2.5007258809555233, + "grad_norm": 0.0031710986513644457, + "learning_rate": 0.26035606687159424, + "loss": 0.081, + "num_input_tokens_seen": 8570288, + "step": 9475 + }, + { + "epoch": 2.5020456645110203, + "grad_norm": 0.0012855075765401125, + "learning_rate": 0.26031616211093733, + "loss": 0.1035, + "num_input_tokens_seen": 8575184, + "step": 9480 + }, + { + "epoch": 2.503365448066517, + "grad_norm": 0.005283057224005461, + "learning_rate": 0.26027624033814195, + "loss": 0.1358, + "num_input_tokens_seen": 8579664, + "step": 9485 + }, + { + "epoch": 2.504685231622014, + "grad_norm": 0.0021302374079823494, + "learning_rate": 0.2602363015593645, + "loss": 0.0776, + "num_input_tokens_seen": 8584112, + "step": 9490 + }, + { + "epoch": 2.506005015177511, + "grad_norm": 0.005627392791211605, + "learning_rate": 0.26019634578076395, + "loss": 0.1319, + "num_input_tokens_seen": 8588560, + "step": 9495 + }, + { + "epoch": 2.507324798733008, + "grad_norm": 0.0013972914312034845, + "learning_rate": 0.26015637300850214, + "loss": 0.0984, + "num_input_tokens_seen": 8592880, + "step": 9500 + }, + { + "epoch": 2.508644582288505, + "grad_norm": 0.002182167489081621, + "learning_rate": 0.26011638324874325, + "loss": 0.092, + "num_input_tokens_seen": 8597424, + "step": 9505 + }, + { + "epoch": 2.5099643658440014, + "grad_norm": 0.0025405101478099823, + "learning_rate": 0.2600763765076543, + "loss": 0.1068, + "num_input_tokens_seen": 8602320, + "step": 9510 + }, + { + "epoch": 2.5112841493994984, + "grad_norm": 0.0036675434093922377, + "learning_rate": 0.2600363527914048, + "loss": 0.1122, + "num_input_tokens_seen": 8606864, + "step": 9515 + }, + { + "epoch": 2.5126039329549954, + "grad_norm": 0.002144621452316642, + "learning_rate": 0.25999631210616686, + "loss": 0.1146, + "num_input_tokens_seen": 8611536, + "step": 9520 + }, + { + "epoch": 2.5139237165104924, + "grad_norm": 0.00278521073050797, + "learning_rate": 0.25995625445811527, + "loss": 0.0764, + "num_input_tokens_seen": 8615952, + "step": 9525 + }, + { + "epoch": 2.5152435000659894, + "grad_norm": 0.002686867956072092, + "learning_rate": 0.2599161798534275, + "loss": 0.1191, + "num_input_tokens_seen": 8620400, + "step": 9530 + }, + { + "epoch": 2.516563283621486, + "grad_norm": 0.0030272146686911583, + "learning_rate": 0.25987608829828346, + "loss": 0.0953, + "num_input_tokens_seen": 8624688, + "step": 9535 + }, + { + "epoch": 2.517883067176983, + "grad_norm": 0.0030819308012723923, + "learning_rate": 0.25983597979886586, + "loss": 0.121, + "num_input_tokens_seen": 8629264, + "step": 9540 + }, + { + "epoch": 2.51920285073248, + "grad_norm": 0.006608471740037203, + "learning_rate": 0.2597958543613599, + "loss": 0.0897, + "num_input_tokens_seen": 8633456, + "step": 9545 + }, + { + "epoch": 2.5205226342879765, + "grad_norm": 0.0032026907429099083, + "learning_rate": 0.25975571199195335, + "loss": 0.1194, + "num_input_tokens_seen": 8638448, + "step": 9550 + }, + { + "epoch": 2.5218424178434735, + "grad_norm": 0.0042789229191839695, + "learning_rate": 0.25971555269683677, + "loss": 0.1113, + "num_input_tokens_seen": 8643184, + "step": 9555 + }, + { + "epoch": 2.5231622013989705, + "grad_norm": 0.002639885526150465, + "learning_rate": 0.25967537648220324, + "loss": 0.1264, + "num_input_tokens_seen": 8647792, + "step": 9560 + }, + { + "epoch": 2.5244819849544675, + "grad_norm": 0.0017792289145290852, + "learning_rate": 0.2596351833542483, + "loss": 0.09, + "num_input_tokens_seen": 8652336, + "step": 9565 + }, + { + "epoch": 2.5258017685099645, + "grad_norm": 0.0030020978301763535, + "learning_rate": 0.25959497331917036, + "loss": 0.1071, + "num_input_tokens_seen": 8656848, + "step": 9570 + }, + { + "epoch": 2.527121552065461, + "grad_norm": 0.003339176531881094, + "learning_rate": 0.2595547463831703, + "loss": 0.1319, + "num_input_tokens_seen": 8661296, + "step": 9575 + }, + { + "epoch": 2.528441335620958, + "grad_norm": 0.0028490275144577026, + "learning_rate": 0.25951450255245156, + "loss": 0.0777, + "num_input_tokens_seen": 8665904, + "step": 9580 + }, + { + "epoch": 2.529761119176455, + "grad_norm": 0.0036648812238126993, + "learning_rate": 0.2594742418332203, + "loss": 0.0939, + "num_input_tokens_seen": 8670576, + "step": 9585 + }, + { + "epoch": 2.531080902731952, + "grad_norm": 0.0032964583951979876, + "learning_rate": 0.2594339642316852, + "loss": 0.132, + "num_input_tokens_seen": 8675280, + "step": 9590 + }, + { + "epoch": 2.532400686287449, + "grad_norm": 0.001654443796724081, + "learning_rate": 0.2593936697540576, + "loss": 0.1096, + "num_input_tokens_seen": 8679536, + "step": 9595 + }, + { + "epoch": 2.5337204698429456, + "grad_norm": 0.002611185424029827, + "learning_rate": 0.2593533584065514, + "loss": 0.0983, + "num_input_tokens_seen": 8684240, + "step": 9600 + }, + { + "epoch": 2.5337204698429456, + "eval_loss": 0.11025646328926086, + "eval_runtime": 75.8011, + "eval_samples_per_second": 88.851, + "eval_steps_per_second": 22.216, + "num_input_tokens_seen": 8684240, + "step": 9600 + }, + { + "epoch": 2.5350402533984426, + "grad_norm": 0.0035427205730229616, + "learning_rate": 0.2593130301953831, + "loss": 0.1125, + "num_input_tokens_seen": 8688656, + "step": 9605 + }, + { + "epoch": 2.5363600369539396, + "grad_norm": 0.002093031769618392, + "learning_rate": 0.2592726851267718, + "loss": 0.0848, + "num_input_tokens_seen": 8693008, + "step": 9610 + }, + { + "epoch": 2.537679820509436, + "grad_norm": 0.002489338396117091, + "learning_rate": 0.2592323232069393, + "loss": 0.1036, + "num_input_tokens_seen": 8697584, + "step": 9615 + }, + { + "epoch": 2.538999604064933, + "grad_norm": 0.002395159797742963, + "learning_rate": 0.25919194444210986, + "loss": 0.1259, + "num_input_tokens_seen": 8701872, + "step": 9620 + }, + { + "epoch": 2.54031938762043, + "grad_norm": 0.0023876912891864777, + "learning_rate": 0.2591515488385103, + "loss": 0.092, + "num_input_tokens_seen": 8706640, + "step": 9625 + }, + { + "epoch": 2.541639171175927, + "grad_norm": 0.0025122775696218014, + "learning_rate": 0.2591111364023704, + "loss": 0.1167, + "num_input_tokens_seen": 8711472, + "step": 9630 + }, + { + "epoch": 2.542958954731424, + "grad_norm": 0.0014959569089114666, + "learning_rate": 0.259070707139922, + "loss": 0.1107, + "num_input_tokens_seen": 8716176, + "step": 9635 + }, + { + "epoch": 2.5442787382869207, + "grad_norm": 0.003051956882700324, + "learning_rate": 0.25903026105739985, + "loss": 0.1115, + "num_input_tokens_seen": 8720720, + "step": 9640 + }, + { + "epoch": 2.5455985218424177, + "grad_norm": 0.003628539154306054, + "learning_rate": 0.2589897981610413, + "loss": 0.1374, + "num_input_tokens_seen": 8724944, + "step": 9645 + }, + { + "epoch": 2.5469183053979148, + "grad_norm": 0.001491388538852334, + "learning_rate": 0.2589493184570863, + "loss": 0.0835, + "num_input_tokens_seen": 8729392, + "step": 9650 + }, + { + "epoch": 2.5482380889534118, + "grad_norm": 0.001894147600978613, + "learning_rate": 0.25890882195177717, + "loss": 0.1066, + "num_input_tokens_seen": 8734224, + "step": 9655 + }, + { + "epoch": 2.5495578725089088, + "grad_norm": 0.0024416942615062, + "learning_rate": 0.25886830865135907, + "loss": 0.0962, + "num_input_tokens_seen": 8738736, + "step": 9660 + }, + { + "epoch": 2.5508776560644053, + "grad_norm": 0.0016548567218706012, + "learning_rate": 0.25882777856207967, + "loss": 0.0929, + "num_input_tokens_seen": 8743408, + "step": 9665 + }, + { + "epoch": 2.5521974396199023, + "grad_norm": 0.0047805472277104855, + "learning_rate": 0.2587872316901892, + "loss": 0.1466, + "num_input_tokens_seen": 8747728, + "step": 9670 + }, + { + "epoch": 2.5535172231753993, + "grad_norm": 0.00283817364834249, + "learning_rate": 0.25874666804194046, + "loss": 0.1343, + "num_input_tokens_seen": 8752112, + "step": 9675 + }, + { + "epoch": 2.554837006730896, + "grad_norm": 0.0032598914112895727, + "learning_rate": 0.258706087623589, + "loss": 0.1059, + "num_input_tokens_seen": 8756688, + "step": 9680 + }, + { + "epoch": 2.556156790286393, + "grad_norm": 0.0015688982093706727, + "learning_rate": 0.25866549044139264, + "loss": 0.1125, + "num_input_tokens_seen": 8761392, + "step": 9685 + }, + { + "epoch": 2.55747657384189, + "grad_norm": 0.001891524763777852, + "learning_rate": 0.25862487650161214, + "loss": 0.0986, + "num_input_tokens_seen": 8765808, + "step": 9690 + }, + { + "epoch": 2.558796357397387, + "grad_norm": 0.00304310186766088, + "learning_rate": 0.2585842458105106, + "loss": 0.1274, + "num_input_tokens_seen": 8770320, + "step": 9695 + }, + { + "epoch": 2.560116140952884, + "grad_norm": 0.0032024283427745104, + "learning_rate": 0.2585435983743538, + "loss": 0.1367, + "num_input_tokens_seen": 8774928, + "step": 9700 + }, + { + "epoch": 2.5614359245083804, + "grad_norm": 0.0030134362168610096, + "learning_rate": 0.2585029341994101, + "loss": 0.084, + "num_input_tokens_seen": 8779536, + "step": 9705 + }, + { + "epoch": 2.5627557080638774, + "grad_norm": 0.00297273276373744, + "learning_rate": 0.2584622532919504, + "loss": 0.1153, + "num_input_tokens_seen": 8784176, + "step": 9710 + }, + { + "epoch": 2.5640754916193744, + "grad_norm": 0.003674419829621911, + "learning_rate": 0.2584215556582482, + "loss": 0.1495, + "num_input_tokens_seen": 8788816, + "step": 9715 + }, + { + "epoch": 2.5653952751748714, + "grad_norm": 0.0016167092835530639, + "learning_rate": 0.25838084130457967, + "loss": 0.098, + "num_input_tokens_seen": 8793552, + "step": 9720 + }, + { + "epoch": 2.5667150587303684, + "grad_norm": 0.0032073608599603176, + "learning_rate": 0.2583401102372234, + "loss": 0.1357, + "num_input_tokens_seen": 8798192, + "step": 9725 + }, + { + "epoch": 2.568034842285865, + "grad_norm": 0.002542046597227454, + "learning_rate": 0.2582993624624606, + "loss": 0.0842, + "num_input_tokens_seen": 8803024, + "step": 9730 + }, + { + "epoch": 2.569354625841362, + "grad_norm": 0.0034888931550085545, + "learning_rate": 0.25825859798657513, + "loss": 0.1222, + "num_input_tokens_seen": 8807376, + "step": 9735 + }, + { + "epoch": 2.570674409396859, + "grad_norm": 0.0032735681161284447, + "learning_rate": 0.25821781681585343, + "loss": 0.1206, + "num_input_tokens_seen": 8811888, + "step": 9740 + }, + { + "epoch": 2.5719941929523555, + "grad_norm": 0.002694379538297653, + "learning_rate": 0.2581770189565844, + "loss": 0.0893, + "num_input_tokens_seen": 8816624, + "step": 9745 + }, + { + "epoch": 2.573313976507853, + "grad_norm": 0.002699043834581971, + "learning_rate": 0.25813620441505963, + "loss": 0.1157, + "num_input_tokens_seen": 8820912, + "step": 9750 + }, + { + "epoch": 2.5746337600633495, + "grad_norm": 0.0021110193338245153, + "learning_rate": 0.2580953731975732, + "loss": 0.1343, + "num_input_tokens_seen": 8825488, + "step": 9755 + }, + { + "epoch": 2.5759535436188465, + "grad_norm": 0.0030692224390804768, + "learning_rate": 0.2580545253104218, + "loss": 0.0925, + "num_input_tokens_seen": 8830256, + "step": 9760 + }, + { + "epoch": 2.5772733271743435, + "grad_norm": 0.001737703918479383, + "learning_rate": 0.2580136607599047, + "loss": 0.1096, + "num_input_tokens_seen": 8834672, + "step": 9765 + }, + { + "epoch": 2.57859311072984, + "grad_norm": 0.0032650448847562075, + "learning_rate": 0.2579727795523238, + "loss": 0.1382, + "num_input_tokens_seen": 8839152, + "step": 9770 + }, + { + "epoch": 2.579912894285337, + "grad_norm": 0.0027515224646776915, + "learning_rate": 0.25793188169398334, + "loss": 0.0916, + "num_input_tokens_seen": 8843408, + "step": 9775 + }, + { + "epoch": 2.581232677840834, + "grad_norm": 0.00279975519515574, + "learning_rate": 0.25789096719119037, + "loss": 0.1023, + "num_input_tokens_seen": 8847920, + "step": 9780 + }, + { + "epoch": 2.582552461396331, + "grad_norm": 0.002205193741247058, + "learning_rate": 0.2578500360502544, + "loss": 0.0997, + "num_input_tokens_seen": 8852368, + "step": 9785 + }, + { + "epoch": 2.583872244951828, + "grad_norm": 0.0024842149578034878, + "learning_rate": 0.2578090882774876, + "loss": 0.0996, + "num_input_tokens_seen": 8856912, + "step": 9790 + }, + { + "epoch": 2.5851920285073247, + "grad_norm": 0.0028995447792112827, + "learning_rate": 0.25776812387920456, + "loss": 0.0883, + "num_input_tokens_seen": 8861552, + "step": 9795 + }, + { + "epoch": 2.5865118120628217, + "grad_norm": 0.0023225441109389067, + "learning_rate": 0.2577271428617225, + "loss": 0.0718, + "num_input_tokens_seen": 8866128, + "step": 9800 + }, + { + "epoch": 2.5865118120628217, + "eval_loss": 0.10902455449104309, + "eval_runtime": 75.8583, + "eval_samples_per_second": 88.784, + "eval_steps_per_second": 22.199, + "num_input_tokens_seen": 8866128, + "step": 9800 + }, + { + "epoch": 2.5878315956183187, + "grad_norm": 0.0026888244319707155, + "learning_rate": 0.25768614523136124, + "loss": 0.0676, + "num_input_tokens_seen": 8870448, + "step": 9805 + }, + { + "epoch": 2.5891513791738157, + "grad_norm": 0.0033979250583797693, + "learning_rate": 0.25764513099444314, + "loss": 0.1193, + "num_input_tokens_seen": 8874992, + "step": 9810 + }, + { + "epoch": 2.5904711627293127, + "grad_norm": 0.003046333324164152, + "learning_rate": 0.25760410015729307, + "loss": 0.1132, + "num_input_tokens_seen": 8879440, + "step": 9815 + }, + { + "epoch": 2.591790946284809, + "grad_norm": 0.004813942592591047, + "learning_rate": 0.2575630527262385, + "loss": 0.1399, + "num_input_tokens_seen": 8883984, + "step": 9820 + }, + { + "epoch": 2.593110729840306, + "grad_norm": 0.004261593334376812, + "learning_rate": 0.25752198870760945, + "loss": 0.1114, + "num_input_tokens_seen": 8888560, + "step": 9825 + }, + { + "epoch": 2.594430513395803, + "grad_norm": 0.001755793928168714, + "learning_rate": 0.2574809081077386, + "loss": 0.0928, + "num_input_tokens_seen": 8893136, + "step": 9830 + }, + { + "epoch": 2.5957502969512998, + "grad_norm": 0.0021201041527092457, + "learning_rate": 0.257439810932961, + "loss": 0.0985, + "num_input_tokens_seen": 8897456, + "step": 9835 + }, + { + "epoch": 2.5970700805067968, + "grad_norm": 0.003177198814228177, + "learning_rate": 0.2573986971896144, + "loss": 0.1562, + "num_input_tokens_seen": 8901968, + "step": 9840 + }, + { + "epoch": 2.5983898640622938, + "grad_norm": 0.0019396186107769608, + "learning_rate": 0.257357566884039, + "loss": 0.1188, + "num_input_tokens_seen": 8906800, + "step": 9845 + }, + { + "epoch": 2.5997096476177908, + "grad_norm": 0.0031701172702014446, + "learning_rate": 0.25731642002257765, + "loss": 0.1154, + "num_input_tokens_seen": 8911216, + "step": 9850 + }, + { + "epoch": 2.6010294311732878, + "grad_norm": 0.0025697730015963316, + "learning_rate": 0.25727525661157574, + "loss": 0.1075, + "num_input_tokens_seen": 8915568, + "step": 9855 + }, + { + "epoch": 2.6023492147287843, + "grad_norm": 0.0021685620304197073, + "learning_rate": 0.2572340766573811, + "loss": 0.1176, + "num_input_tokens_seen": 8920016, + "step": 9860 + }, + { + "epoch": 2.6036689982842813, + "grad_norm": 0.0012447601184248924, + "learning_rate": 0.25719288016634434, + "loss": 0.0951, + "num_input_tokens_seen": 8924432, + "step": 9865 + }, + { + "epoch": 2.6049887818397783, + "grad_norm": 0.0028330455534160137, + "learning_rate": 0.25715166714481835, + "loss": 0.1021, + "num_input_tokens_seen": 8929040, + "step": 9870 + }, + { + "epoch": 2.6063085653952753, + "grad_norm": 0.004101494327187538, + "learning_rate": 0.2571104375991587, + "loss": 0.1187, + "num_input_tokens_seen": 8933680, + "step": 9875 + }, + { + "epoch": 2.6076283489507723, + "grad_norm": 0.002959647448733449, + "learning_rate": 0.2570691915357236, + "loss": 0.0849, + "num_input_tokens_seen": 8938384, + "step": 9880 + }, + { + "epoch": 2.608948132506269, + "grad_norm": 0.0025117890909314156, + "learning_rate": 0.2570279289608736, + "loss": 0.0998, + "num_input_tokens_seen": 8942832, + "step": 9885 + }, + { + "epoch": 2.610267916061766, + "grad_norm": 0.003034894587472081, + "learning_rate": 0.256986649880972, + "loss": 0.1133, + "num_input_tokens_seen": 8947472, + "step": 9890 + }, + { + "epoch": 2.611587699617263, + "grad_norm": 0.003632250474765897, + "learning_rate": 0.25694535430238447, + "loss": 0.1025, + "num_input_tokens_seen": 8951888, + "step": 9895 + }, + { + "epoch": 2.6129074831727594, + "grad_norm": 0.0026296426076442003, + "learning_rate": 0.25690404223147933, + "loss": 0.1264, + "num_input_tokens_seen": 8956400, + "step": 9900 + }, + { + "epoch": 2.6142272667282564, + "grad_norm": 0.0028793038800358772, + "learning_rate": 0.2568627136746275, + "loss": 0.0957, + "num_input_tokens_seen": 8960656, + "step": 9905 + }, + { + "epoch": 2.6155470502837534, + "grad_norm": 0.0024028990883380175, + "learning_rate": 0.25682136863820226, + "loss": 0.1028, + "num_input_tokens_seen": 8964944, + "step": 9910 + }, + { + "epoch": 2.6168668338392505, + "grad_norm": 0.003196108154952526, + "learning_rate": 0.25678000712857957, + "loss": 0.1121, + "num_input_tokens_seen": 8969520, + "step": 9915 + }, + { + "epoch": 2.6181866173947475, + "grad_norm": 0.002093435497954488, + "learning_rate": 0.2567386291521379, + "loss": 0.14, + "num_input_tokens_seen": 8973904, + "step": 9920 + }, + { + "epoch": 2.619506400950244, + "grad_norm": 0.0024769266601651907, + "learning_rate": 0.2566972347152583, + "loss": 0.0593, + "num_input_tokens_seen": 8978416, + "step": 9925 + }, + { + "epoch": 2.620826184505741, + "grad_norm": 0.003420172957703471, + "learning_rate": 0.2566558238243242, + "loss": 0.0919, + "num_input_tokens_seen": 8982896, + "step": 9930 + }, + { + "epoch": 2.622145968061238, + "grad_norm": 0.0016292709624394774, + "learning_rate": 0.25661439648572176, + "loss": 0.0784, + "num_input_tokens_seen": 8987120, + "step": 9935 + }, + { + "epoch": 2.623465751616735, + "grad_norm": 0.0027290668804198503, + "learning_rate": 0.25657295270583963, + "loss": 0.0772, + "num_input_tokens_seen": 8991472, + "step": 9940 + }, + { + "epoch": 2.624785535172232, + "grad_norm": 0.0028781017754226923, + "learning_rate": 0.25653149249106894, + "loss": 0.0934, + "num_input_tokens_seen": 8995984, + "step": 9945 + }, + { + "epoch": 2.6261053187277286, + "grad_norm": 0.004463556222617626, + "learning_rate": 0.25649001584780323, + "loss": 0.0994, + "num_input_tokens_seen": 9000656, + "step": 9950 + }, + { + "epoch": 2.6274251022832256, + "grad_norm": 0.0028075193986296654, + "learning_rate": 0.2564485227824389, + "loss": 0.1134, + "num_input_tokens_seen": 9005168, + "step": 9955 + }, + { + "epoch": 2.6287448858387226, + "grad_norm": 0.0048936414532363415, + "learning_rate": 0.25640701330137466, + "loss": 0.1593, + "num_input_tokens_seen": 9009520, + "step": 9960 + }, + { + "epoch": 2.630064669394219, + "grad_norm": 0.0032766954973340034, + "learning_rate": 0.2563654874110117, + "loss": 0.0987, + "num_input_tokens_seen": 9014128, + "step": 9965 + }, + { + "epoch": 2.631384452949716, + "grad_norm": 0.004342743661254644, + "learning_rate": 0.256323945117754, + "loss": 0.1198, + "num_input_tokens_seen": 9018544, + "step": 9970 + }, + { + "epoch": 2.632704236505213, + "grad_norm": 0.0023017884232103825, + "learning_rate": 0.2562823864280078, + "loss": 0.1069, + "num_input_tokens_seen": 9023216, + "step": 9975 + }, + { + "epoch": 2.63402402006071, + "grad_norm": 0.0014424800174310803, + "learning_rate": 0.25624081134818194, + "loss": 0.1222, + "num_input_tokens_seen": 9027760, + "step": 9980 + }, + { + "epoch": 2.635343803616207, + "grad_norm": 0.0017996751703321934, + "learning_rate": 0.2561992198846879, + "loss": 0.1078, + "num_input_tokens_seen": 9032048, + "step": 9985 + }, + { + "epoch": 2.6366635871717037, + "grad_norm": 0.0037849375512450933, + "learning_rate": 0.25615761204393955, + "loss": 0.1374, + "num_input_tokens_seen": 9036784, + "step": 9990 + }, + { + "epoch": 2.6379833707272007, + "grad_norm": 0.00474481051787734, + "learning_rate": 0.2561159878323534, + "loss": 0.1344, + "num_input_tokens_seen": 9041168, + "step": 9995 + }, + { + "epoch": 2.6393031542826977, + "grad_norm": 0.003318597562611103, + "learning_rate": 0.2560743472563483, + "loss": 0.1192, + "num_input_tokens_seen": 9045584, + "step": 10000 + }, + { + "epoch": 2.6393031542826977, + "eval_loss": 0.12156959623098373, + "eval_runtime": 75.8018, + "eval_samples_per_second": 88.85, + "eval_steps_per_second": 22.216, + "num_input_tokens_seen": 9045584, + "step": 10000 + }, + { + "epoch": 2.6406229378381947, + "grad_norm": 0.0017134805675595999, + "learning_rate": 0.25603269032234593, + "loss": 0.1064, + "num_input_tokens_seen": 9050064, + "step": 10005 + }, + { + "epoch": 2.6419427213936917, + "grad_norm": 0.0015819421969354153, + "learning_rate": 0.2559910170367702, + "loss": 0.0933, + "num_input_tokens_seen": 9054160, + "step": 10010 + }, + { + "epoch": 2.6432625049491882, + "grad_norm": 0.002527151955291629, + "learning_rate": 0.2559493274060477, + "loss": 0.1418, + "num_input_tokens_seen": 9058512, + "step": 10015 + }, + { + "epoch": 2.6445822885046852, + "grad_norm": 0.0018900760915130377, + "learning_rate": 0.2559076214366074, + "loss": 0.1033, + "num_input_tokens_seen": 9063056, + "step": 10020 + }, + { + "epoch": 2.6459020720601822, + "grad_norm": 0.0033547081984579563, + "learning_rate": 0.25586589913488106, + "loss": 0.1, + "num_input_tokens_seen": 9067728, + "step": 10025 + }, + { + "epoch": 2.647221855615679, + "grad_norm": 0.00152048678137362, + "learning_rate": 0.2558241605073026, + "loss": 0.0701, + "num_input_tokens_seen": 9071952, + "step": 10030 + }, + { + "epoch": 2.648541639171176, + "grad_norm": 0.002359438454732299, + "learning_rate": 0.25578240556030873, + "loss": 0.1272, + "num_input_tokens_seen": 9076144, + "step": 10035 + }, + { + "epoch": 2.649861422726673, + "grad_norm": 0.003420009044930339, + "learning_rate": 0.2557406343003386, + "loss": 0.1304, + "num_input_tokens_seen": 9080464, + "step": 10040 + }, + { + "epoch": 2.65118120628217, + "grad_norm": 0.003035488538444042, + "learning_rate": 0.25569884673383375, + "loss": 0.0794, + "num_input_tokens_seen": 9084656, + "step": 10045 + }, + { + "epoch": 2.652500989837667, + "grad_norm": 0.003295755246654153, + "learning_rate": 0.25565704286723856, + "loss": 0.1012, + "num_input_tokens_seen": 9089168, + "step": 10050 + }, + { + "epoch": 2.6538207733931634, + "grad_norm": 0.0029080414678901434, + "learning_rate": 0.25561522270699955, + "loss": 0.0817, + "num_input_tokens_seen": 9094032, + "step": 10055 + }, + { + "epoch": 2.6551405569486604, + "grad_norm": 0.0024505620822310448, + "learning_rate": 0.25557338625956594, + "loss": 0.0897, + "num_input_tokens_seen": 9098768, + "step": 10060 + }, + { + "epoch": 2.6564603405041574, + "grad_norm": 0.0015221221838146448, + "learning_rate": 0.25553153353138947, + "loss": 0.0877, + "num_input_tokens_seen": 9103408, + "step": 10065 + }, + { + "epoch": 2.6577801240596544, + "grad_norm": 0.001995926257222891, + "learning_rate": 0.2554896645289243, + "loss": 0.0654, + "num_input_tokens_seen": 9108048, + "step": 10070 + }, + { + "epoch": 2.6590999076151514, + "grad_norm": 0.0016599702648818493, + "learning_rate": 0.2554477792586272, + "loss": 0.1017, + "num_input_tokens_seen": 9112720, + "step": 10075 + }, + { + "epoch": 2.660419691170648, + "grad_norm": 0.004128627944737673, + "learning_rate": 0.25540587772695744, + "loss": 0.112, + "num_input_tokens_seen": 9117456, + "step": 10080 + }, + { + "epoch": 2.661739474726145, + "grad_norm": 0.001642607501707971, + "learning_rate": 0.2553639599403767, + "loss": 0.0984, + "num_input_tokens_seen": 9121936, + "step": 10085 + }, + { + "epoch": 2.663059258281642, + "grad_norm": 0.001922939089126885, + "learning_rate": 0.2553220259053493, + "loss": 0.1265, + "num_input_tokens_seen": 9126288, + "step": 10090 + }, + { + "epoch": 2.6643790418371385, + "grad_norm": 0.002193618565797806, + "learning_rate": 0.2552800756283419, + "loss": 0.1327, + "num_input_tokens_seen": 9130608, + "step": 10095 + }, + { + "epoch": 2.6656988253926355, + "grad_norm": 0.0020394441671669483, + "learning_rate": 0.25523810911582373, + "loss": 0.1128, + "num_input_tokens_seen": 9134896, + "step": 10100 + }, + { + "epoch": 2.6670186089481325, + "grad_norm": 0.0021859128028154373, + "learning_rate": 0.25519612637426675, + "loss": 0.0847, + "num_input_tokens_seen": 9139408, + "step": 10105 + }, + { + "epoch": 2.6683383925036295, + "grad_norm": 0.0016997227212414145, + "learning_rate": 0.25515412741014504, + "loss": 0.0974, + "num_input_tokens_seen": 9143920, + "step": 10110 + }, + { + "epoch": 2.6696581760591265, + "grad_norm": 0.003805549116805196, + "learning_rate": 0.2551121122299355, + "loss": 0.1339, + "num_input_tokens_seen": 9148464, + "step": 10115 + }, + { + "epoch": 2.670977959614623, + "grad_norm": 0.0036419741809368134, + "learning_rate": 0.2550700808401173, + "loss": 0.0856, + "num_input_tokens_seen": 9153168, + "step": 10120 + }, + { + "epoch": 2.67229774317012, + "grad_norm": 0.0013337506679818034, + "learning_rate": 0.2550280332471722, + "loss": 0.0771, + "num_input_tokens_seen": 9157712, + "step": 10125 + }, + { + "epoch": 2.673617526725617, + "grad_norm": 0.003356347093358636, + "learning_rate": 0.2549859694575845, + "loss": 0.12, + "num_input_tokens_seen": 9162352, + "step": 10130 + }, + { + "epoch": 2.674937310281114, + "grad_norm": 0.003115349914878607, + "learning_rate": 0.254943889477841, + "loss": 0.12, + "num_input_tokens_seen": 9167056, + "step": 10135 + }, + { + "epoch": 2.676257093836611, + "grad_norm": 0.003514957847073674, + "learning_rate": 0.25490179331443097, + "loss": 0.0893, + "num_input_tokens_seen": 9171824, + "step": 10140 + }, + { + "epoch": 2.6775768773921076, + "grad_norm": 0.001449872856028378, + "learning_rate": 0.25485968097384615, + "loss": 0.0841, + "num_input_tokens_seen": 9176400, + "step": 10145 + }, + { + "epoch": 2.6788966609476046, + "grad_norm": 0.0032306089997291565, + "learning_rate": 0.25481755246258075, + "loss": 0.1223, + "num_input_tokens_seen": 9180944, + "step": 10150 + }, + { + "epoch": 2.6802164445031016, + "grad_norm": 0.0044292570091784, + "learning_rate": 0.2547754077871315, + "loss": 0.1074, + "num_input_tokens_seen": 9185328, + "step": 10155 + }, + { + "epoch": 2.681536228058598, + "grad_norm": 0.003025664482265711, + "learning_rate": 0.25473324695399774, + "loss": 0.1222, + "num_input_tokens_seen": 9189712, + "step": 10160 + }, + { + "epoch": 2.682856011614095, + "grad_norm": 0.0036635003052651882, + "learning_rate": 0.25469106996968105, + "loss": 0.0916, + "num_input_tokens_seen": 9194704, + "step": 10165 + }, + { + "epoch": 2.684175795169592, + "grad_norm": 0.0026834532618522644, + "learning_rate": 0.2546488768406858, + "loss": 0.0829, + "num_input_tokens_seen": 9198960, + "step": 10170 + }, + { + "epoch": 2.685495578725089, + "grad_norm": 0.002246422925963998, + "learning_rate": 0.25460666757351863, + "loss": 0.0853, + "num_input_tokens_seen": 9203440, + "step": 10175 + }, + { + "epoch": 2.686815362280586, + "grad_norm": 0.0016155753983184695, + "learning_rate": 0.25456444217468877, + "loss": 0.0894, + "num_input_tokens_seen": 9207952, + "step": 10180 + }, + { + "epoch": 2.6881351458360827, + "grad_norm": 0.0028734018560498953, + "learning_rate": 0.25452220065070785, + "loss": 0.1053, + "num_input_tokens_seen": 9212048, + "step": 10185 + }, + { + "epoch": 2.6894549293915797, + "grad_norm": 0.003979251254349947, + "learning_rate": 0.2544799430080901, + "loss": 0.0739, + "num_input_tokens_seen": 9216720, + "step": 10190 + }, + { + "epoch": 2.6907747129470767, + "grad_norm": 0.003316909074783325, + "learning_rate": 0.2544376692533522, + "loss": 0.1439, + "num_input_tokens_seen": 9221008, + "step": 10195 + }, + { + "epoch": 2.6920944965025737, + "grad_norm": 0.004051956348121166, + "learning_rate": 0.2543953793930132, + "loss": 0.1236, + "num_input_tokens_seen": 9225744, + "step": 10200 + }, + { + "epoch": 2.6920944965025737, + "eval_loss": 0.10960737615823746, + "eval_runtime": 75.8883, + "eval_samples_per_second": 88.749, + "eval_steps_per_second": 22.191, + "num_input_tokens_seen": 9225744, + "step": 10200 + }, + { + "epoch": 2.6934142800580707, + "grad_norm": 0.0016066033858805895, + "learning_rate": 0.2543530734335948, + "loss": 0.1178, + "num_input_tokens_seen": 9229872, + "step": 10205 + }, + { + "epoch": 2.6947340636135673, + "grad_norm": 0.002514080610126257, + "learning_rate": 0.2543107513816211, + "loss": 0.0779, + "num_input_tokens_seen": 9234672, + "step": 10210 + }, + { + "epoch": 2.6960538471690643, + "grad_norm": 0.0027978892903774977, + "learning_rate": 0.25426841324361865, + "loss": 0.1152, + "num_input_tokens_seen": 9239184, + "step": 10215 + }, + { + "epoch": 2.6973736307245613, + "grad_norm": 0.0014565063174813986, + "learning_rate": 0.2542260590261166, + "loss": 0.0578, + "num_input_tokens_seen": 9243856, + "step": 10220 + }, + { + "epoch": 2.698693414280058, + "grad_norm": 0.0017497459193691611, + "learning_rate": 0.2541836887356465, + "loss": 0.0827, + "num_input_tokens_seen": 9248368, + "step": 10225 + }, + { + "epoch": 2.700013197835555, + "grad_norm": 0.0047383494675159454, + "learning_rate": 0.2541413023787423, + "loss": 0.1191, + "num_input_tokens_seen": 9253008, + "step": 10230 + }, + { + "epoch": 2.701332981391052, + "grad_norm": 0.0035653216764330864, + "learning_rate": 0.2540988999619405, + "loss": 0.1003, + "num_input_tokens_seen": 9257872, + "step": 10235 + }, + { + "epoch": 2.702652764946549, + "grad_norm": 0.0022661774419248104, + "learning_rate": 0.25405648149178023, + "loss": 0.0862, + "num_input_tokens_seen": 9262704, + "step": 10240 + }, + { + "epoch": 2.703972548502046, + "grad_norm": 0.0034137519542127848, + "learning_rate": 0.2540140469748028, + "loss": 0.1113, + "num_input_tokens_seen": 9267472, + "step": 10245 + }, + { + "epoch": 2.7052923320575424, + "grad_norm": 0.0030569937080144882, + "learning_rate": 0.25397159641755224, + "loss": 0.1111, + "num_input_tokens_seen": 9272208, + "step": 10250 + }, + { + "epoch": 2.7066121156130394, + "grad_norm": 0.001594472210854292, + "learning_rate": 0.2539291298265749, + "loss": 0.0753, + "num_input_tokens_seen": 9276848, + "step": 10255 + }, + { + "epoch": 2.7079318991685364, + "grad_norm": 0.0025858485605567694, + "learning_rate": 0.2538866472084197, + "loss": 0.1079, + "num_input_tokens_seen": 9281232, + "step": 10260 + }, + { + "epoch": 2.7092516827240334, + "grad_norm": 0.002924407832324505, + "learning_rate": 0.25384414856963794, + "loss": 0.1264, + "num_input_tokens_seen": 9285968, + "step": 10265 + }, + { + "epoch": 2.7105714662795304, + "grad_norm": 0.003977527376264334, + "learning_rate": 0.25380163391678356, + "loss": 0.172, + "num_input_tokens_seen": 9290480, + "step": 10270 + }, + { + "epoch": 2.711891249835027, + "grad_norm": 0.0033079287968575954, + "learning_rate": 0.2537591032564127, + "loss": 0.125, + "num_input_tokens_seen": 9295120, + "step": 10275 + }, + { + "epoch": 2.713211033390524, + "grad_norm": 0.0037616679910570383, + "learning_rate": 0.25371655659508424, + "loss": 0.1261, + "num_input_tokens_seen": 9299952, + "step": 10280 + }, + { + "epoch": 2.714530816946021, + "grad_norm": 0.001961167436093092, + "learning_rate": 0.25367399393935935, + "loss": 0.1156, + "num_input_tokens_seen": 9304720, + "step": 10285 + }, + { + "epoch": 2.7158506005015175, + "grad_norm": 0.0019744113087654114, + "learning_rate": 0.25363141529580174, + "loss": 0.1028, + "num_input_tokens_seen": 9309168, + "step": 10290 + }, + { + "epoch": 2.7171703840570145, + "grad_norm": 0.0019694846123456955, + "learning_rate": 0.2535888206709776, + "loss": 0.078, + "num_input_tokens_seen": 9313904, + "step": 10295 + }, + { + "epoch": 2.7184901676125115, + "grad_norm": 0.002221273723989725, + "learning_rate": 0.2535462100714555, + "loss": 0.1201, + "num_input_tokens_seen": 9318352, + "step": 10300 + }, + { + "epoch": 2.7198099511680085, + "grad_norm": 0.003093257313594222, + "learning_rate": 0.2535035835038066, + "loss": 0.1348, + "num_input_tokens_seen": 9323088, + "step": 10305 + }, + { + "epoch": 2.7211297347235055, + "grad_norm": 0.0029659168794751167, + "learning_rate": 0.2534609409746044, + "loss": 0.1114, + "num_input_tokens_seen": 9327728, + "step": 10310 + }, + { + "epoch": 2.722449518279002, + "grad_norm": 0.003122305730357766, + "learning_rate": 0.253418282490425, + "loss": 0.1394, + "num_input_tokens_seen": 9332208, + "step": 10315 + }, + { + "epoch": 2.723769301834499, + "grad_norm": 0.0019507682882249355, + "learning_rate": 0.2533756080578467, + "loss": 0.1229, + "num_input_tokens_seen": 9336816, + "step": 10320 + }, + { + "epoch": 2.725089085389996, + "grad_norm": 0.0017600104911252856, + "learning_rate": 0.25333291768345056, + "loss": 0.112, + "num_input_tokens_seen": 9340976, + "step": 10325 + }, + { + "epoch": 2.726408868945493, + "grad_norm": 0.0015480074798688293, + "learning_rate": 0.25329021137381996, + "loss": 0.1118, + "num_input_tokens_seen": 9345168, + "step": 10330 + }, + { + "epoch": 2.72772865250099, + "grad_norm": 0.0038964932318776846, + "learning_rate": 0.25324748913554074, + "loss": 0.1189, + "num_input_tokens_seen": 9349680, + "step": 10335 + }, + { + "epoch": 2.7290484360564866, + "grad_norm": 0.0031296033412218094, + "learning_rate": 0.2532047509752013, + "loss": 0.0918, + "num_input_tokens_seen": 9354384, + "step": 10340 + }, + { + "epoch": 2.7303682196119836, + "grad_norm": 0.002969315042719245, + "learning_rate": 0.25316199689939217, + "loss": 0.1318, + "num_input_tokens_seen": 9358832, + "step": 10345 + }, + { + "epoch": 2.7316880031674806, + "grad_norm": 0.002994624665006995, + "learning_rate": 0.2531192269147068, + "loss": 0.118, + "num_input_tokens_seen": 9363600, + "step": 10350 + }, + { + "epoch": 2.733007786722977, + "grad_norm": 0.0026227596681565046, + "learning_rate": 0.2530764410277407, + "loss": 0.1105, + "num_input_tokens_seen": 9368400, + "step": 10355 + }, + { + "epoch": 2.734327570278474, + "grad_norm": 0.001381757203489542, + "learning_rate": 0.25303363924509203, + "loss": 0.0671, + "num_input_tokens_seen": 9372880, + "step": 10360 + }, + { + "epoch": 2.735647353833971, + "grad_norm": 0.0025098691694438457, + "learning_rate": 0.25299082157336145, + "loss": 0.1161, + "num_input_tokens_seen": 9377968, + "step": 10365 + }, + { + "epoch": 2.736967137389468, + "grad_norm": 0.004348110873252153, + "learning_rate": 0.2529479880191519, + "loss": 0.1289, + "num_input_tokens_seen": 9382480, + "step": 10370 + }, + { + "epoch": 2.738286920944965, + "grad_norm": 0.0014192098751664162, + "learning_rate": 0.2529051385890689, + "loss": 0.1189, + "num_input_tokens_seen": 9386992, + "step": 10375 + }, + { + "epoch": 2.7396067045004617, + "grad_norm": 0.0028819371946156025, + "learning_rate": 0.2528622732897203, + "loss": 0.1033, + "num_input_tokens_seen": 9391696, + "step": 10380 + }, + { + "epoch": 2.7409264880559587, + "grad_norm": 0.00211556744761765, + "learning_rate": 0.25281939212771654, + "loss": 0.0791, + "num_input_tokens_seen": 9396208, + "step": 10385 + }, + { + "epoch": 2.7422462716114557, + "grad_norm": 0.001116443076170981, + "learning_rate": 0.2527764951096704, + "loss": 0.1111, + "num_input_tokens_seen": 9400496, + "step": 10390 + }, + { + "epoch": 2.7435660551669527, + "grad_norm": 0.0031893369741737843, + "learning_rate": 0.2527335822421971, + "loss": 0.1205, + "num_input_tokens_seen": 9404976, + "step": 10395 + }, + { + "epoch": 2.7448858387224497, + "grad_norm": 0.003814436960965395, + "learning_rate": 0.25269065353191444, + "loss": 0.134, + "num_input_tokens_seen": 9409616, + "step": 10400 + }, + { + "epoch": 2.7448858387224497, + "eval_loss": 0.11261699348688126, + "eval_runtime": 75.9723, + "eval_samples_per_second": 88.651, + "eval_steps_per_second": 22.166, + "num_input_tokens_seen": 9409616, + "step": 10400 + }, + { + "epoch": 2.7462056222779463, + "grad_norm": 0.0024597609881311655, + "learning_rate": 0.2526477089854425, + "loss": 0.153, + "num_input_tokens_seen": 9414384, + "step": 10405 + }, + { + "epoch": 2.7475254058334433, + "grad_norm": 0.002058603335171938, + "learning_rate": 0.25260474860940385, + "loss": 0.0895, + "num_input_tokens_seen": 9418832, + "step": 10410 + }, + { + "epoch": 2.7488451893889403, + "grad_norm": 0.0018580735195428133, + "learning_rate": 0.2525617724104236, + "loss": 0.1123, + "num_input_tokens_seen": 9422928, + "step": 10415 + }, + { + "epoch": 2.750164972944437, + "grad_norm": 0.0029570283368229866, + "learning_rate": 0.25251878039512915, + "loss": 0.1141, + "num_input_tokens_seen": 9427312, + "step": 10420 + }, + { + "epoch": 2.751484756499934, + "grad_norm": 0.002271167002618313, + "learning_rate": 0.25247577257015047, + "loss": 0.0807, + "num_input_tokens_seen": 9432112, + "step": 10425 + }, + { + "epoch": 2.752804540055431, + "grad_norm": 0.002509208396077156, + "learning_rate": 0.2524327489421198, + "loss": 0.0931, + "num_input_tokens_seen": 9436528, + "step": 10430 + }, + { + "epoch": 2.754124323610928, + "grad_norm": 0.001017126371152699, + "learning_rate": 0.25238970951767203, + "loss": 0.0709, + "num_input_tokens_seen": 9440976, + "step": 10435 + }, + { + "epoch": 2.755444107166425, + "grad_norm": 0.0019480135524645448, + "learning_rate": 0.25234665430344433, + "loss": 0.0894, + "num_input_tokens_seen": 9445616, + "step": 10440 + }, + { + "epoch": 2.7567638907219214, + "grad_norm": 0.0045744250528514385, + "learning_rate": 0.2523035833060764, + "loss": 0.126, + "num_input_tokens_seen": 9450320, + "step": 10445 + }, + { + "epoch": 2.7580836742774184, + "grad_norm": 0.0016278170514851809, + "learning_rate": 0.2522604965322103, + "loss": 0.0777, + "num_input_tokens_seen": 9454768, + "step": 10450 + }, + { + "epoch": 2.7594034578329154, + "grad_norm": 0.002287917770445347, + "learning_rate": 0.25221739398849047, + "loss": 0.0838, + "num_input_tokens_seen": 9459312, + "step": 10455 + }, + { + "epoch": 2.7607232413884124, + "grad_norm": 0.0007883376674726605, + "learning_rate": 0.252174275681564, + "loss": 0.1152, + "num_input_tokens_seen": 9464240, + "step": 10460 + }, + { + "epoch": 2.7620430249439094, + "grad_norm": 0.003529148641973734, + "learning_rate": 0.2521311416180802, + "loss": 0.0763, + "num_input_tokens_seen": 9468880, + "step": 10465 + }, + { + "epoch": 2.763362808499406, + "grad_norm": 0.0019396194256842136, + "learning_rate": 0.25208799180469094, + "loss": 0.0999, + "num_input_tokens_seen": 9473392, + "step": 10470 + }, + { + "epoch": 2.764682592054903, + "grad_norm": 0.0023582330904901028, + "learning_rate": 0.2520448262480504, + "loss": 0.1425, + "num_input_tokens_seen": 9477872, + "step": 10475 + }, + { + "epoch": 2.7660023756104, + "grad_norm": 0.002771007362753153, + "learning_rate": 0.25200164495481525, + "loss": 0.1573, + "num_input_tokens_seen": 9482256, + "step": 10480 + }, + { + "epoch": 2.7673221591658965, + "grad_norm": 0.0021915228571742773, + "learning_rate": 0.25195844793164474, + "loss": 0.0878, + "num_input_tokens_seen": 9486992, + "step": 10485 + }, + { + "epoch": 2.768641942721394, + "grad_norm": 0.0022177807986736298, + "learning_rate": 0.2519152351852001, + "loss": 0.1074, + "num_input_tokens_seen": 9491216, + "step": 10490 + }, + { + "epoch": 2.7699617262768905, + "grad_norm": 0.0023580610286444426, + "learning_rate": 0.25187200672214555, + "loss": 0.1039, + "num_input_tokens_seen": 9495888, + "step": 10495 + }, + { + "epoch": 2.7712815098323875, + "grad_norm": 0.0021326381247490644, + "learning_rate": 0.2518287625491473, + "loss": 0.0954, + "num_input_tokens_seen": 9500240, + "step": 10500 + }, + { + "epoch": 2.7726012933878845, + "grad_norm": 0.0016925862291827798, + "learning_rate": 0.25178550267287425, + "loss": 0.1034, + "num_input_tokens_seen": 9504976, + "step": 10505 + }, + { + "epoch": 2.773921076943381, + "grad_norm": 0.003522614249959588, + "learning_rate": 0.2517422270999976, + "loss": 0.0966, + "num_input_tokens_seen": 9509520, + "step": 10510 + }, + { + "epoch": 2.775240860498878, + "grad_norm": 0.002260446548461914, + "learning_rate": 0.2516989358371909, + "loss": 0.0942, + "num_input_tokens_seen": 9514288, + "step": 10515 + }, + { + "epoch": 2.776560644054375, + "grad_norm": 0.001443330547772348, + "learning_rate": 0.25165562889113025, + "loss": 0.1091, + "num_input_tokens_seen": 9518864, + "step": 10520 + }, + { + "epoch": 2.777880427609872, + "grad_norm": 0.0013600134989246726, + "learning_rate": 0.2516123062684942, + "loss": 0.1252, + "num_input_tokens_seen": 9523248, + "step": 10525 + }, + { + "epoch": 2.779200211165369, + "grad_norm": 0.0012073337566107512, + "learning_rate": 0.25156896797596356, + "loss": 0.0801, + "num_input_tokens_seen": 9527600, + "step": 10530 + }, + { + "epoch": 2.7805199947208656, + "grad_norm": 0.004558257292956114, + "learning_rate": 0.2515256140202216, + "loss": 0.1362, + "num_input_tokens_seen": 9532496, + "step": 10535 + }, + { + "epoch": 2.7818397782763626, + "grad_norm": 0.00261182920075953, + "learning_rate": 0.25148224440795425, + "loss": 0.0844, + "num_input_tokens_seen": 9536784, + "step": 10540 + }, + { + "epoch": 2.7831595618318596, + "grad_norm": 0.0027767950668931007, + "learning_rate": 0.2514388591458494, + "loss": 0.0831, + "num_input_tokens_seen": 9540752, + "step": 10545 + }, + { + "epoch": 2.784479345387356, + "grad_norm": 0.0014027862343937159, + "learning_rate": 0.2513954582405977, + "loss": 0.0912, + "num_input_tokens_seen": 9545264, + "step": 10550 + }, + { + "epoch": 2.7857991289428536, + "grad_norm": 0.0032880178187042475, + "learning_rate": 0.2513520416988922, + "loss": 0.1367, + "num_input_tokens_seen": 9549616, + "step": 10555 + }, + { + "epoch": 2.78711891249835, + "grad_norm": 0.0012686720583587885, + "learning_rate": 0.2513086095274281, + "loss": 0.0671, + "num_input_tokens_seen": 9554448, + "step": 10560 + }, + { + "epoch": 2.788438696053847, + "grad_norm": 0.0032105795107781887, + "learning_rate": 0.25126516173290336, + "loss": 0.1098, + "num_input_tokens_seen": 9558768, + "step": 10565 + }, + { + "epoch": 2.789758479609344, + "grad_norm": 0.002815091283991933, + "learning_rate": 0.2512216983220181, + "loss": 0.1115, + "num_input_tokens_seen": 9563440, + "step": 10570 + }, + { + "epoch": 2.7910782631648408, + "grad_norm": 0.0026922242250293493, + "learning_rate": 0.25117821930147494, + "loss": 0.0918, + "num_input_tokens_seen": 9568144, + "step": 10575 + }, + { + "epoch": 2.7923980467203378, + "grad_norm": 0.0028793918900191784, + "learning_rate": 0.2511347246779788, + "loss": 0.0694, + "num_input_tokens_seen": 9572528, + "step": 10580 + }, + { + "epoch": 2.7937178302758348, + "grad_norm": 0.0022854870185256004, + "learning_rate": 0.25109121445823723, + "loss": 0.1067, + "num_input_tokens_seen": 9576976, + "step": 10585 + }, + { + "epoch": 2.7950376138313318, + "grad_norm": 0.0034476283472031355, + "learning_rate": 0.25104768864896004, + "loss": 0.1146, + "num_input_tokens_seen": 9581392, + "step": 10590 + }, + { + "epoch": 2.7963573973868288, + "grad_norm": 0.0023601052816957235, + "learning_rate": 0.2510041472568594, + "loss": 0.0822, + "num_input_tokens_seen": 9586192, + "step": 10595 + }, + { + "epoch": 2.7976771809423253, + "grad_norm": 0.002992014167830348, + "learning_rate": 0.25096059028864987, + "loss": 0.1187, + "num_input_tokens_seen": 9590384, + "step": 10600 + }, + { + "epoch": 2.7976771809423253, + "eval_loss": 0.10502682626247406, + "eval_runtime": 75.8535, + "eval_samples_per_second": 88.79, + "eval_steps_per_second": 22.201, + "num_input_tokens_seen": 9590384, + "step": 10600 + }, + { + "epoch": 2.7989969644978223, + "grad_norm": 0.002797376364469528, + "learning_rate": 0.25091701775104863, + "loss": 0.1147, + "num_input_tokens_seen": 9594704, + "step": 10605 + }, + { + "epoch": 2.8003167480533193, + "grad_norm": 0.0027405216824263334, + "learning_rate": 0.250873429650775, + "loss": 0.1314, + "num_input_tokens_seen": 9599152, + "step": 10610 + }, + { + "epoch": 2.8016365316088163, + "grad_norm": 0.0022110450081527233, + "learning_rate": 0.25082982599455095, + "loss": 0.0779, + "num_input_tokens_seen": 9603920, + "step": 10615 + }, + { + "epoch": 2.8029563151643133, + "grad_norm": 0.0018497237470000982, + "learning_rate": 0.2507862067891006, + "loss": 0.0866, + "num_input_tokens_seen": 9608784, + "step": 10620 + }, + { + "epoch": 2.80427609871981, + "grad_norm": 0.002063186839222908, + "learning_rate": 0.25074257204115064, + "loss": 0.1019, + "num_input_tokens_seen": 9613072, + "step": 10625 + }, + { + "epoch": 2.805595882275307, + "grad_norm": 0.003598355920985341, + "learning_rate": 0.25069892175742997, + "loss": 0.1328, + "num_input_tokens_seen": 9617552, + "step": 10630 + }, + { + "epoch": 2.806915665830804, + "grad_norm": 0.002380897756665945, + "learning_rate": 0.25065525594467014, + "loss": 0.1033, + "num_input_tokens_seen": 9622000, + "step": 10635 + }, + { + "epoch": 2.8082354493863004, + "grad_norm": 0.0015158269088715315, + "learning_rate": 0.2506115746096049, + "loss": 0.0958, + "num_input_tokens_seen": 9626512, + "step": 10640 + }, + { + "epoch": 2.8095552329417974, + "grad_norm": 0.0021895894315093756, + "learning_rate": 0.25056787775897055, + "loss": 0.0705, + "num_input_tokens_seen": 9631184, + "step": 10645 + }, + { + "epoch": 2.8108750164972944, + "grad_norm": 0.0013861993793398142, + "learning_rate": 0.2505241653995056, + "loss": 0.1011, + "num_input_tokens_seen": 9635760, + "step": 10650 + }, + { + "epoch": 2.8121948000527914, + "grad_norm": 0.0025914344005286694, + "learning_rate": 0.25048043753795113, + "loss": 0.1063, + "num_input_tokens_seen": 9640368, + "step": 10655 + }, + { + "epoch": 2.8135145836082884, + "grad_norm": 0.001856186892837286, + "learning_rate": 0.2504366941810504, + "loss": 0.0894, + "num_input_tokens_seen": 9644848, + "step": 10660 + }, + { + "epoch": 2.814834367163785, + "grad_norm": 0.0033947492484003305, + "learning_rate": 0.2503929353355493, + "loss": 0.1323, + "num_input_tokens_seen": 9649584, + "step": 10665 + }, + { + "epoch": 2.816154150719282, + "grad_norm": 0.002619533333927393, + "learning_rate": 0.250349161008196, + "loss": 0.1174, + "num_input_tokens_seen": 9654096, + "step": 10670 + }, + { + "epoch": 2.817473934274779, + "grad_norm": 0.002142460085451603, + "learning_rate": 0.2503053712057409, + "loss": 0.108, + "num_input_tokens_seen": 9658768, + "step": 10675 + }, + { + "epoch": 2.818793717830276, + "grad_norm": 0.0036749853752553463, + "learning_rate": 0.25026156593493715, + "loss": 0.0839, + "num_input_tokens_seen": 9663248, + "step": 10680 + }, + { + "epoch": 2.820113501385773, + "grad_norm": 0.0022474126890301704, + "learning_rate": 0.2502177452025399, + "loss": 0.0925, + "num_input_tokens_seen": 9667536, + "step": 10685 + }, + { + "epoch": 2.8214332849412695, + "grad_norm": 0.00284402328543365, + "learning_rate": 0.25017390901530695, + "loss": 0.1097, + "num_input_tokens_seen": 9672016, + "step": 10690 + }, + { + "epoch": 2.8227530684967665, + "grad_norm": 0.000727789243683219, + "learning_rate": 0.2501300573799984, + "loss": 0.0888, + "num_input_tokens_seen": 9676784, + "step": 10695 + }, + { + "epoch": 2.8240728520522635, + "grad_norm": 0.0023193489760160446, + "learning_rate": 0.2500861903033766, + "loss": 0.0882, + "num_input_tokens_seen": 9681360, + "step": 10700 + }, + { + "epoch": 2.82539263560776, + "grad_norm": 0.003285621525719762, + "learning_rate": 0.25004230779220654, + "loss": 0.1129, + "num_input_tokens_seen": 9685712, + "step": 10705 + }, + { + "epoch": 2.826712419163257, + "grad_norm": 0.002039738465100527, + "learning_rate": 0.24999840985325542, + "loss": 0.1613, + "num_input_tokens_seen": 9690000, + "step": 10710 + }, + { + "epoch": 2.828032202718754, + "grad_norm": 0.00192659767344594, + "learning_rate": 0.24995449649329285, + "loss": 0.1522, + "num_input_tokens_seen": 9694896, + "step": 10715 + }, + { + "epoch": 2.829351986274251, + "grad_norm": 0.0038384241051971912, + "learning_rate": 0.2499105677190908, + "loss": 0.1251, + "num_input_tokens_seen": 9699568, + "step": 10720 + }, + { + "epoch": 2.830671769829748, + "grad_norm": 0.0021198317408561707, + "learning_rate": 0.24986662353742364, + "loss": 0.0691, + "num_input_tokens_seen": 9703888, + "step": 10725 + }, + { + "epoch": 2.8319915533852447, + "grad_norm": 0.00194803427439183, + "learning_rate": 0.24982266395506814, + "loss": 0.1008, + "num_input_tokens_seen": 9708496, + "step": 10730 + }, + { + "epoch": 2.8333113369407417, + "grad_norm": 0.004295907914638519, + "learning_rate": 0.2497786889788034, + "loss": 0.1243, + "num_input_tokens_seen": 9713168, + "step": 10735 + }, + { + "epoch": 2.8346311204962387, + "grad_norm": 0.002659986261278391, + "learning_rate": 0.24973469861541095, + "loss": 0.1099, + "num_input_tokens_seen": 9717872, + "step": 10740 + }, + { + "epoch": 2.8359509040517357, + "grad_norm": 0.002557021100074053, + "learning_rate": 0.24969069287167456, + "loss": 0.1036, + "num_input_tokens_seen": 9722640, + "step": 10745 + }, + { + "epoch": 2.8372706876072327, + "grad_norm": 0.0014965953305363655, + "learning_rate": 0.2496466717543806, + "loss": 0.1146, + "num_input_tokens_seen": 9727408, + "step": 10750 + }, + { + "epoch": 2.8385904711627292, + "grad_norm": 0.0016433894634246826, + "learning_rate": 0.24960263527031762, + "loss": 0.0794, + "num_input_tokens_seen": 9731792, + "step": 10755 + }, + { + "epoch": 2.8399102547182262, + "grad_norm": 0.0020074816420674324, + "learning_rate": 0.24955858342627657, + "loss": 0.0801, + "num_input_tokens_seen": 9736208, + "step": 10760 + }, + { + "epoch": 2.8412300382737232, + "grad_norm": 0.001922260271385312, + "learning_rate": 0.24951451622905083, + "loss": 0.0922, + "num_input_tokens_seen": 9740560, + "step": 10765 + }, + { + "epoch": 2.84254982182922, + "grad_norm": 0.003254848998039961, + "learning_rate": 0.24947043368543612, + "loss": 0.0861, + "num_input_tokens_seen": 9745552, + "step": 10770 + }, + { + "epoch": 2.843869605384717, + "grad_norm": 0.002612157491967082, + "learning_rate": 0.2494263358022305, + "loss": 0.0566, + "num_input_tokens_seen": 9750288, + "step": 10775 + }, + { + "epoch": 2.845189388940214, + "grad_norm": 0.0016806789208203554, + "learning_rate": 0.24938222258623444, + "loss": 0.1003, + "num_input_tokens_seen": 9754672, + "step": 10780 + }, + { + "epoch": 2.846509172495711, + "grad_norm": 0.003362977644428611, + "learning_rate": 0.24933809404425075, + "loss": 0.1276, + "num_input_tokens_seen": 9759152, + "step": 10785 + }, + { + "epoch": 2.847828956051208, + "grad_norm": 0.00339716044254601, + "learning_rate": 0.24929395018308453, + "loss": 0.0906, + "num_input_tokens_seen": 9763664, + "step": 10790 + }, + { + "epoch": 2.8491487396067043, + "grad_norm": 0.0017062561819329858, + "learning_rate": 0.24924979100954348, + "loss": 0.0995, + "num_input_tokens_seen": 9768112, + "step": 10795 + }, + { + "epoch": 2.8504685231622013, + "grad_norm": 0.000691285589709878, + "learning_rate": 0.24920561653043735, + "loss": 0.0766, + "num_input_tokens_seen": 9772624, + "step": 10800 + }, + { + "epoch": 2.8504685231622013, + "eval_loss": 0.11549827456474304, + "eval_runtime": 75.9332, + "eval_samples_per_second": 88.696, + "eval_steps_per_second": 22.177, + "num_input_tokens_seen": 9772624, + "step": 10800 + }, + { + "epoch": 2.8517883067176983, + "grad_norm": 0.0010336698032915592, + "learning_rate": 0.24916142675257846, + "loss": 0.1436, + "num_input_tokens_seen": 9777136, + "step": 10805 + }, + { + "epoch": 2.8531080902731953, + "grad_norm": 0.0032784794457256794, + "learning_rate": 0.24911722168278144, + "loss": 0.084, + "num_input_tokens_seen": 9781840, + "step": 10810 + }, + { + "epoch": 2.8544278738286923, + "grad_norm": 0.002595463301986456, + "learning_rate": 0.24907300132786328, + "loss": 0.079, + "num_input_tokens_seen": 9786608, + "step": 10815 + }, + { + "epoch": 2.855747657384189, + "grad_norm": 0.0013305323664098978, + "learning_rate": 0.24902876569464322, + "loss": 0.0912, + "num_input_tokens_seen": 9791088, + "step": 10820 + }, + { + "epoch": 2.857067440939686, + "grad_norm": 0.0018013173248618841, + "learning_rate": 0.24898451478994305, + "loss": 0.0834, + "num_input_tokens_seen": 9795760, + "step": 10825 + }, + { + "epoch": 2.858387224495183, + "grad_norm": 0.002459738403558731, + "learning_rate": 0.2489402486205868, + "loss": 0.0794, + "num_input_tokens_seen": 9800112, + "step": 10830 + }, + { + "epoch": 2.8597070080506795, + "grad_norm": 0.0015350400935858488, + "learning_rate": 0.24889596719340085, + "loss": 0.1262, + "num_input_tokens_seen": 9804656, + "step": 10835 + }, + { + "epoch": 2.8610267916061765, + "grad_norm": 0.002996224444359541, + "learning_rate": 0.24885167051521392, + "loss": 0.0815, + "num_input_tokens_seen": 9809072, + "step": 10840 + }, + { + "epoch": 2.8623465751616735, + "grad_norm": 0.00314038316719234, + "learning_rate": 0.24880735859285716, + "loss": 0.0709, + "num_input_tokens_seen": 9813904, + "step": 10845 + }, + { + "epoch": 2.8636663587171705, + "grad_norm": 0.0020913435146212578, + "learning_rate": 0.24876303143316406, + "loss": 0.0792, + "num_input_tokens_seen": 9818288, + "step": 10850 + }, + { + "epoch": 2.8649861422726675, + "grad_norm": 0.005221430212259293, + "learning_rate": 0.24871868904297031, + "loss": 0.0894, + "num_input_tokens_seen": 9822832, + "step": 10855 + }, + { + "epoch": 2.866305925828164, + "grad_norm": 0.0018409204203635454, + "learning_rate": 0.24867433142911416, + "loss": 0.065, + "num_input_tokens_seen": 9827408, + "step": 10860 + }, + { + "epoch": 2.867625709383661, + "grad_norm": 0.002792896470054984, + "learning_rate": 0.24862995859843612, + "loss": 0.1492, + "num_input_tokens_seen": 9831888, + "step": 10865 + }, + { + "epoch": 2.868945492939158, + "grad_norm": 0.0014137030811980367, + "learning_rate": 0.24858557055777897, + "loss": 0.0702, + "num_input_tokens_seen": 9836624, + "step": 10870 + }, + { + "epoch": 2.870265276494655, + "grad_norm": 0.0027496819384396076, + "learning_rate": 0.24854116731398793, + "loss": 0.1399, + "num_input_tokens_seen": 9841040, + "step": 10875 + }, + { + "epoch": 2.871585060050152, + "grad_norm": 0.002451092703267932, + "learning_rate": 0.24849674887391052, + "loss": 0.1162, + "num_input_tokens_seen": 9845648, + "step": 10880 + }, + { + "epoch": 2.8729048436056486, + "grad_norm": 0.003639503847807646, + "learning_rate": 0.2484523152443967, + "loss": 0.1254, + "num_input_tokens_seen": 9850288, + "step": 10885 + }, + { + "epoch": 2.8742246271611456, + "grad_norm": 0.0017957595409825444, + "learning_rate": 0.24840786643229862, + "loss": 0.1237, + "num_input_tokens_seen": 9854864, + "step": 10890 + }, + { + "epoch": 2.8755444107166426, + "grad_norm": 0.003362057264894247, + "learning_rate": 0.2483634024444709, + "loss": 0.1289, + "num_input_tokens_seen": 9859184, + "step": 10895 + }, + { + "epoch": 2.876864194272139, + "grad_norm": 0.002703318605199456, + "learning_rate": 0.24831892328777033, + "loss": 0.1187, + "num_input_tokens_seen": 9863728, + "step": 10900 + }, + { + "epoch": 2.878183977827636, + "grad_norm": 0.002705493476241827, + "learning_rate": 0.2482744289690563, + "loss": 0.1214, + "num_input_tokens_seen": 9868048, + "step": 10905 + }, + { + "epoch": 2.879503761383133, + "grad_norm": 0.0016176797216758132, + "learning_rate": 0.2482299194951903, + "loss": 0.0962, + "num_input_tokens_seen": 9872784, + "step": 10910 + }, + { + "epoch": 2.88082354493863, + "grad_norm": 0.0029487828724086285, + "learning_rate": 0.2481853948730363, + "loss": 0.098, + "num_input_tokens_seen": 9877264, + "step": 10915 + }, + { + "epoch": 2.882143328494127, + "grad_norm": 0.0027825520373880863, + "learning_rate": 0.24814085510946052, + "loss": 0.1087, + "num_input_tokens_seen": 9881776, + "step": 10920 + }, + { + "epoch": 2.8834631120496237, + "grad_norm": 0.0018484571482986212, + "learning_rate": 0.24809630021133158, + "loss": 0.1059, + "num_input_tokens_seen": 9886480, + "step": 10925 + }, + { + "epoch": 2.8847828956051207, + "grad_norm": 0.001098884386010468, + "learning_rate": 0.24805173018552037, + "loss": 0.1371, + "num_input_tokens_seen": 9890960, + "step": 10930 + }, + { + "epoch": 2.8861026791606177, + "grad_norm": 0.0036653922870755196, + "learning_rate": 0.2480071450389002, + "loss": 0.0958, + "num_input_tokens_seen": 9895312, + "step": 10935 + }, + { + "epoch": 2.8874224627161147, + "grad_norm": 0.002720700576901436, + "learning_rate": 0.24796254477834662, + "loss": 0.1053, + "num_input_tokens_seen": 9899568, + "step": 10940 + }, + { + "epoch": 2.8887422462716117, + "grad_norm": 0.0019169056322425604, + "learning_rate": 0.24791792941073754, + "loss": 0.1172, + "num_input_tokens_seen": 9904272, + "step": 10945 + }, + { + "epoch": 2.8900620298271082, + "grad_norm": 0.00199909252114594, + "learning_rate": 0.2478732989429533, + "loss": 0.0759, + "num_input_tokens_seen": 9908848, + "step": 10950 + }, + { + "epoch": 2.8913818133826052, + "grad_norm": 0.002728394465520978, + "learning_rate": 0.24782865338187632, + "loss": 0.0745, + "num_input_tokens_seen": 9913168, + "step": 10955 + }, + { + "epoch": 2.8927015969381022, + "grad_norm": 0.0020545783918350935, + "learning_rate": 0.2477839927343916, + "loss": 0.115, + "num_input_tokens_seen": 9917712, + "step": 10960 + }, + { + "epoch": 2.894021380493599, + "grad_norm": 0.0035000364296138287, + "learning_rate": 0.2477393170073864, + "loss": 0.1092, + "num_input_tokens_seen": 9922256, + "step": 10965 + }, + { + "epoch": 2.895341164049096, + "grad_norm": 0.0035360832698643208, + "learning_rate": 0.2476946262077503, + "loss": 0.1066, + "num_input_tokens_seen": 9926800, + "step": 10970 + }, + { + "epoch": 2.896660947604593, + "grad_norm": 0.003984067589044571, + "learning_rate": 0.24764992034237507, + "loss": 0.1632, + "num_input_tokens_seen": 9931632, + "step": 10975 + }, + { + "epoch": 2.89798073116009, + "grad_norm": 0.0030774325132369995, + "learning_rate": 0.24760519941815498, + "loss": 0.1424, + "num_input_tokens_seen": 9936208, + "step": 10980 + }, + { + "epoch": 2.899300514715587, + "grad_norm": 0.002989324973896146, + "learning_rate": 0.2475604634419866, + "loss": 0.101, + "num_input_tokens_seen": 9940432, + "step": 10985 + }, + { + "epoch": 2.9006202982710834, + "grad_norm": 0.0014826551778241992, + "learning_rate": 0.24751571242076872, + "loss": 0.0781, + "num_input_tokens_seen": 9944720, + "step": 10990 + }, + { + "epoch": 2.9019400818265804, + "grad_norm": 0.00235609314404428, + "learning_rate": 0.2474709463614025, + "loss": 0.1126, + "num_input_tokens_seen": 9949392, + "step": 10995 + }, + { + "epoch": 2.9032598653820774, + "grad_norm": 0.0026969232130795717, + "learning_rate": 0.24742616527079145, + "loss": 0.0848, + "num_input_tokens_seen": 9953712, + "step": 11000 + }, + { + "epoch": 2.9032598653820774, + "eval_loss": 0.1032017394900322, + "eval_runtime": 75.6113, + "eval_samples_per_second": 89.074, + "eval_steps_per_second": 22.272, + "num_input_tokens_seen": 9953712, + "step": 11000 + }, + { + "epoch": 2.9045796489375744, + "grad_norm": 0.0014075902290642262, + "learning_rate": 0.24738136915584139, + "loss": 0.0934, + "num_input_tokens_seen": 9958096, + "step": 11005 + }, + { + "epoch": 2.9058994324930714, + "grad_norm": 0.004699875600636005, + "learning_rate": 0.24733655802346047, + "loss": 0.1233, + "num_input_tokens_seen": 9962576, + "step": 11010 + }, + { + "epoch": 2.907219216048568, + "grad_norm": 0.0034985437523573637, + "learning_rate": 0.24729173188055906, + "loss": 0.1495, + "num_input_tokens_seen": 9966864, + "step": 11015 + }, + { + "epoch": 2.908538999604065, + "grad_norm": 0.0018476562108844519, + "learning_rate": 0.24724689073404996, + "loss": 0.0988, + "num_input_tokens_seen": 9971408, + "step": 11020 + }, + { + "epoch": 2.909858783159562, + "grad_norm": 0.0015899871941655874, + "learning_rate": 0.24720203459084822, + "loss": 0.127, + "num_input_tokens_seen": 9976080, + "step": 11025 + }, + { + "epoch": 2.9111785667150585, + "grad_norm": 0.0034055537544190884, + "learning_rate": 0.24715716345787123, + "loss": 0.0931, + "num_input_tokens_seen": 9980400, + "step": 11030 + }, + { + "epoch": 2.9124983502705555, + "grad_norm": 0.0034832987003028393, + "learning_rate": 0.2471122773420387, + "loss": 0.1124, + "num_input_tokens_seen": 9985040, + "step": 11035 + }, + { + "epoch": 2.9138181338260525, + "grad_norm": 0.0022114841267466545, + "learning_rate": 0.24706737625027259, + "loss": 0.1312, + "num_input_tokens_seen": 9989424, + "step": 11040 + }, + { + "epoch": 2.9151379173815495, + "grad_norm": 0.0021704293321818113, + "learning_rate": 0.24702246018949725, + "loss": 0.1077, + "num_input_tokens_seen": 9994064, + "step": 11045 + }, + { + "epoch": 2.9164577009370465, + "grad_norm": 0.003225448541343212, + "learning_rate": 0.2469775291666393, + "loss": 0.1192, + "num_input_tokens_seen": 9998512, + "step": 11050 + }, + { + "epoch": 2.917777484492543, + "grad_norm": 0.002987730083987117, + "learning_rate": 0.24693258318862765, + "loss": 0.0957, + "num_input_tokens_seen": 10003056, + "step": 11055 + }, + { + "epoch": 2.91909726804804, + "grad_norm": 0.0007725092582404613, + "learning_rate": 0.2468876222623935, + "loss": 0.0901, + "num_input_tokens_seen": 10007504, + "step": 11060 + }, + { + "epoch": 2.920417051603537, + "grad_norm": 0.0016511237481608987, + "learning_rate": 0.2468426463948705, + "loss": 0.1183, + "num_input_tokens_seen": 10012048, + "step": 11065 + }, + { + "epoch": 2.921736835159034, + "grad_norm": 0.0012735077179968357, + "learning_rate": 0.24679765559299438, + "loss": 0.1122, + "num_input_tokens_seen": 10016528, + "step": 11070 + }, + { + "epoch": 2.923056618714531, + "grad_norm": 0.0021662660874426365, + "learning_rate": 0.24675264986370332, + "loss": 0.0974, + "num_input_tokens_seen": 10021296, + "step": 11075 + }, + { + "epoch": 2.9243764022700276, + "grad_norm": 0.0022799663711339235, + "learning_rate": 0.2467076292139378, + "loss": 0.1331, + "num_input_tokens_seen": 10025776, + "step": 11080 + }, + { + "epoch": 2.9256961858255246, + "grad_norm": 0.001561889541335404, + "learning_rate": 0.24666259365064055, + "loss": 0.0762, + "num_input_tokens_seen": 10030256, + "step": 11085 + }, + { + "epoch": 2.9270159693810216, + "grad_norm": 0.002095187548547983, + "learning_rate": 0.24661754318075663, + "loss": 0.071, + "num_input_tokens_seen": 10034864, + "step": 11090 + }, + { + "epoch": 2.928335752936518, + "grad_norm": 0.002989549422636628, + "learning_rate": 0.2465724778112334, + "loss": 0.1344, + "num_input_tokens_seen": 10039056, + "step": 11095 + }, + { + "epoch": 2.929655536492015, + "grad_norm": 0.002747986698523164, + "learning_rate": 0.24652739754902042, + "loss": 0.0889, + "num_input_tokens_seen": 10043440, + "step": 11100 + }, + { + "epoch": 2.930975320047512, + "grad_norm": 0.0015893286326900125, + "learning_rate": 0.24648230240106975, + "loss": 0.0963, + "num_input_tokens_seen": 10048144, + "step": 11105 + }, + { + "epoch": 2.932295103603009, + "grad_norm": 0.00364104937762022, + "learning_rate": 0.2464371923743356, + "loss": 0.1043, + "num_input_tokens_seen": 10052880, + "step": 11110 + }, + { + "epoch": 2.933614887158506, + "grad_norm": 0.0030339814256876707, + "learning_rate": 0.24639206747577444, + "loss": 0.0872, + "num_input_tokens_seen": 10057168, + "step": 11115 + }, + { + "epoch": 2.9349346707140027, + "grad_norm": 0.0016728974878787994, + "learning_rate": 0.24634692771234515, + "loss": 0.1526, + "num_input_tokens_seen": 10061136, + "step": 11120 + }, + { + "epoch": 2.9362544542694997, + "grad_norm": 0.0021966148633509874, + "learning_rate": 0.2463017730910088, + "loss": 0.1599, + "num_input_tokens_seen": 10065680, + "step": 11125 + }, + { + "epoch": 2.9375742378249967, + "grad_norm": 0.000840452266857028, + "learning_rate": 0.2462566036187289, + "loss": 0.0796, + "num_input_tokens_seen": 10070224, + "step": 11130 + }, + { + "epoch": 2.9388940213804937, + "grad_norm": 0.001667679287493229, + "learning_rate": 0.24621141930247106, + "loss": 0.1264, + "num_input_tokens_seen": 10074544, + "step": 11135 + }, + { + "epoch": 2.9402138049359907, + "grad_norm": 0.0019665095023810863, + "learning_rate": 0.2461662201492033, + "loss": 0.0886, + "num_input_tokens_seen": 10078992, + "step": 11140 + }, + { + "epoch": 2.9415335884914873, + "grad_norm": 0.0021254403982311487, + "learning_rate": 0.24612100616589586, + "loss": 0.1176, + "num_input_tokens_seen": 10083280, + "step": 11145 + }, + { + "epoch": 2.9428533720469843, + "grad_norm": 0.001611293526366353, + "learning_rate": 0.24607577735952135, + "loss": 0.1158, + "num_input_tokens_seen": 10087952, + "step": 11150 + }, + { + "epoch": 2.9441731556024813, + "grad_norm": 0.0022111120633780956, + "learning_rate": 0.24603053373705464, + "loss": 0.0929, + "num_input_tokens_seen": 10092528, + "step": 11155 + }, + { + "epoch": 2.945492939157978, + "grad_norm": 0.0011830955045297742, + "learning_rate": 0.2459852753054728, + "loss": 0.0822, + "num_input_tokens_seen": 10097040, + "step": 11160 + }, + { + "epoch": 2.946812722713475, + "grad_norm": 0.001387616852298379, + "learning_rate": 0.24594000207175526, + "loss": 0.075, + "num_input_tokens_seen": 10101648, + "step": 11165 + }, + { + "epoch": 2.948132506268972, + "grad_norm": 0.002089166548103094, + "learning_rate": 0.2458947140428838, + "loss": 0.0775, + "num_input_tokens_seen": 10105968, + "step": 11170 + }, + { + "epoch": 2.949452289824469, + "grad_norm": 0.002350672846660018, + "learning_rate": 0.24584941122584233, + "loss": 0.0872, + "num_input_tokens_seen": 10110448, + "step": 11175 + }, + { + "epoch": 2.950772073379966, + "grad_norm": 0.0023044326808303595, + "learning_rate": 0.24580409362761713, + "loss": 0.1378, + "num_input_tokens_seen": 10114736, + "step": 11180 + }, + { + "epoch": 2.9520918569354624, + "grad_norm": 0.002563697984442115, + "learning_rate": 0.2457587612551967, + "loss": 0.1198, + "num_input_tokens_seen": 10119376, + "step": 11185 + }, + { + "epoch": 2.9534116404909594, + "grad_norm": 0.001589059247635305, + "learning_rate": 0.24571341411557193, + "loss": 0.0917, + "num_input_tokens_seen": 10123856, + "step": 11190 + }, + { + "epoch": 2.9547314240464564, + "grad_norm": 0.002405752893537283, + "learning_rate": 0.2456680522157359, + "loss": 0.0804, + "num_input_tokens_seen": 10128336, + "step": 11195 + }, + { + "epoch": 2.9560512076019534, + "grad_norm": 0.002611771458759904, + "learning_rate": 0.245622675562684, + "loss": 0.1019, + "num_input_tokens_seen": 10132784, + "step": 11200 + }, + { + "epoch": 2.9560512076019534, + "eval_loss": 0.11019360274076462, + "eval_runtime": 75.8359, + "eval_samples_per_second": 88.81, + "eval_steps_per_second": 22.206, + "num_input_tokens_seen": 10132784, + "step": 11200 + }, + { + "epoch": 2.9573709911574504, + "grad_norm": 0.004434145987033844, + "learning_rate": 0.24557728416341384, + "loss": 0.1126, + "num_input_tokens_seen": 10137104, + "step": 11205 + }, + { + "epoch": 2.958690774712947, + "grad_norm": 0.003689659759402275, + "learning_rate": 0.24553187802492538, + "loss": 0.0884, + "num_input_tokens_seen": 10141744, + "step": 11210 + }, + { + "epoch": 2.960010558268444, + "grad_norm": 0.0015533862169831991, + "learning_rate": 0.24548645715422074, + "loss": 0.0968, + "num_input_tokens_seen": 10145968, + "step": 11215 + }, + { + "epoch": 2.961330341823941, + "grad_norm": 0.0025638288352638483, + "learning_rate": 0.2454410215583045, + "loss": 0.132, + "num_input_tokens_seen": 10150192, + "step": 11220 + }, + { + "epoch": 2.9626501253794375, + "grad_norm": 0.0024451622739434242, + "learning_rate": 0.24539557124418332, + "loss": 0.1183, + "num_input_tokens_seen": 10154992, + "step": 11225 + }, + { + "epoch": 2.9639699089349345, + "grad_norm": 0.004813474602997303, + "learning_rate": 0.24535010621886624, + "loss": 0.1, + "num_input_tokens_seen": 10159696, + "step": 11230 + }, + { + "epoch": 2.9652896924904315, + "grad_norm": 0.0014807750703766942, + "learning_rate": 0.2453046264893646, + "loss": 0.1144, + "num_input_tokens_seen": 10164080, + "step": 11235 + }, + { + "epoch": 2.9666094760459285, + "grad_norm": 0.003304550191387534, + "learning_rate": 0.24525913206269184, + "loss": 0.132, + "num_input_tokens_seen": 10168560, + "step": 11240 + }, + { + "epoch": 2.9679292596014255, + "grad_norm": 0.001453994307667017, + "learning_rate": 0.2452136229458638, + "loss": 0.0897, + "num_input_tokens_seen": 10172848, + "step": 11245 + }, + { + "epoch": 2.969249043156922, + "grad_norm": 0.0026391251012682915, + "learning_rate": 0.24516809914589857, + "loss": 0.1161, + "num_input_tokens_seen": 10177360, + "step": 11250 + }, + { + "epoch": 2.970568826712419, + "grad_norm": 0.0034092534333467484, + "learning_rate": 0.2451225606698165, + "loss": 0.1187, + "num_input_tokens_seen": 10181904, + "step": 11255 + }, + { + "epoch": 2.971888610267916, + "grad_norm": 0.0024328413419425488, + "learning_rate": 0.2450770075246402, + "loss": 0.0878, + "num_input_tokens_seen": 10186416, + "step": 11260 + }, + { + "epoch": 2.973208393823413, + "grad_norm": 0.003059874987229705, + "learning_rate": 0.24503143971739455, + "loss": 0.0712, + "num_input_tokens_seen": 10190960, + "step": 11265 + }, + { + "epoch": 2.97452817737891, + "grad_norm": 0.0016185074346140027, + "learning_rate": 0.24498585725510663, + "loss": 0.0909, + "num_input_tokens_seen": 10195440, + "step": 11270 + }, + { + "epoch": 2.9758479609344066, + "grad_norm": 0.0016809251392260194, + "learning_rate": 0.24494026014480583, + "loss": 0.1019, + "num_input_tokens_seen": 10199856, + "step": 11275 + }, + { + "epoch": 2.9771677444899036, + "grad_norm": 0.0014190090587362647, + "learning_rate": 0.24489464839352387, + "loss": 0.0606, + "num_input_tokens_seen": 10204624, + "step": 11280 + }, + { + "epoch": 2.9784875280454006, + "grad_norm": 0.002589572686702013, + "learning_rate": 0.2448490220082946, + "loss": 0.0864, + "num_input_tokens_seen": 10208848, + "step": 11285 + }, + { + "epoch": 2.979807311600897, + "grad_norm": 0.0015370434848591685, + "learning_rate": 0.24480338099615415, + "loss": 0.0881, + "num_input_tokens_seen": 10213296, + "step": 11290 + }, + { + "epoch": 2.9811270951563946, + "grad_norm": 0.0023262272588908672, + "learning_rate": 0.244757725364141, + "loss": 0.1245, + "num_input_tokens_seen": 10217840, + "step": 11295 + }, + { + "epoch": 2.982446878711891, + "grad_norm": 0.0012076270068064332, + "learning_rate": 0.24471205511929583, + "loss": 0.1092, + "num_input_tokens_seen": 10222320, + "step": 11300 + }, + { + "epoch": 2.983766662267388, + "grad_norm": 0.0019462740747258067, + "learning_rate": 0.24466637026866145, + "loss": 0.1423, + "num_input_tokens_seen": 10226928, + "step": 11305 + }, + { + "epoch": 2.985086445822885, + "grad_norm": 0.0028855877462774515, + "learning_rate": 0.2446206708192832, + "loss": 0.131, + "num_input_tokens_seen": 10231536, + "step": 11310 + }, + { + "epoch": 2.9864062293783817, + "grad_norm": 0.0014475068310275674, + "learning_rate": 0.2445749567782084, + "loss": 0.1309, + "num_input_tokens_seen": 10235920, + "step": 11315 + }, + { + "epoch": 2.9877260129338787, + "grad_norm": 0.0019872388802468777, + "learning_rate": 0.2445292281524868, + "loss": 0.1117, + "num_input_tokens_seen": 10240432, + "step": 11320 + }, + { + "epoch": 2.9890457964893757, + "grad_norm": 0.0016185471322387457, + "learning_rate": 0.24448348494917022, + "loss": 0.0954, + "num_input_tokens_seen": 10244688, + "step": 11325 + }, + { + "epoch": 2.9903655800448727, + "grad_norm": 0.001271720859222114, + "learning_rate": 0.24443772717531295, + "loss": 0.133, + "num_input_tokens_seen": 10248976, + "step": 11330 + }, + { + "epoch": 2.9916853636003697, + "grad_norm": 0.002258660038933158, + "learning_rate": 0.24439195483797138, + "loss": 0.107, + "num_input_tokens_seen": 10253360, + "step": 11335 + }, + { + "epoch": 2.9930051471558663, + "grad_norm": 0.0027147370856255293, + "learning_rate": 0.24434616794420416, + "loss": 0.1128, + "num_input_tokens_seen": 10257968, + "step": 11340 + }, + { + "epoch": 2.9943249307113633, + "grad_norm": 0.0014544236473739147, + "learning_rate": 0.24430036650107223, + "loss": 0.1142, + "num_input_tokens_seen": 10262896, + "step": 11345 + }, + { + "epoch": 2.9956447142668603, + "grad_norm": 0.0011478194501250982, + "learning_rate": 0.2442545505156387, + "loss": 0.0936, + "num_input_tokens_seen": 10267504, + "step": 11350 + }, + { + "epoch": 2.9969644978223573, + "grad_norm": 0.0036519255954772234, + "learning_rate": 0.24420871999496904, + "loss": 0.12, + "num_input_tokens_seen": 10272240, + "step": 11355 + }, + { + "epoch": 2.9982842813778543, + "grad_norm": 0.0017762115458026528, + "learning_rate": 0.24416287494613084, + "loss": 0.0846, + "num_input_tokens_seen": 10276720, + "step": 11360 + }, + { + "epoch": 2.999604064933351, + "grad_norm": 0.0011431664461269975, + "learning_rate": 0.24411701537619399, + "loss": 0.0711, + "num_input_tokens_seen": 10281264, + "step": 11365 + }, + { + "epoch": 3.0007918701332983, + "grad_norm": 0.0014392149168998003, + "learning_rate": 0.24407114129223062, + "loss": 0.0602, + "num_input_tokens_seen": 10284960, + "step": 11370 + }, + { + "epoch": 3.002111653688795, + "grad_norm": 0.002130044624209404, + "learning_rate": 0.2440252527013151, + "loss": 0.1105, + "num_input_tokens_seen": 10289952, + "step": 11375 + }, + { + "epoch": 3.003431437244292, + "grad_norm": 0.003114310558885336, + "learning_rate": 0.24397934961052403, + "loss": 0.0856, + "num_input_tokens_seen": 10294336, + "step": 11380 + }, + { + "epoch": 3.004751220799789, + "grad_norm": 0.0020938124507665634, + "learning_rate": 0.24393343202693618, + "loss": 0.0667, + "num_input_tokens_seen": 10299200, + "step": 11385 + }, + { + "epoch": 3.006071004355286, + "grad_norm": 0.0024107517674565315, + "learning_rate": 0.2438874999576327, + "loss": 0.1257, + "num_input_tokens_seen": 10303616, + "step": 11390 + }, + { + "epoch": 3.007390787910783, + "grad_norm": 0.0034665013663470745, + "learning_rate": 0.24384155340969688, + "loss": 0.127, + "num_input_tokens_seen": 10308384, + "step": 11395 + }, + { + "epoch": 3.0087105714662794, + "grad_norm": 0.0017266892828047276, + "learning_rate": 0.24379559239021423, + "loss": 0.115, + "num_input_tokens_seen": 10312800, + "step": 11400 + }, + { + "epoch": 3.0087105714662794, + "eval_loss": 0.10218437761068344, + "eval_runtime": 75.8671, + "eval_samples_per_second": 88.774, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 10312800, + "step": 11400 + }, + { + "epoch": 3.0100303550217764, + "grad_norm": 0.0012891502119600773, + "learning_rate": 0.2437496169062725, + "loss": 0.0764, + "num_input_tokens_seen": 10317056, + "step": 11405 + }, + { + "epoch": 3.0113501385772734, + "grad_norm": 0.003076897468417883, + "learning_rate": 0.24370362696496176, + "loss": 0.0947, + "num_input_tokens_seen": 10321376, + "step": 11410 + }, + { + "epoch": 3.0126699221327704, + "grad_norm": 0.0022245331201702356, + "learning_rate": 0.24365762257337417, + "loss": 0.129, + "num_input_tokens_seen": 10325760, + "step": 11415 + }, + { + "epoch": 3.013989705688267, + "grad_norm": 0.001700722030363977, + "learning_rate": 0.2436116037386042, + "loss": 0.0743, + "num_input_tokens_seen": 10330144, + "step": 11420 + }, + { + "epoch": 3.015309489243764, + "grad_norm": 0.0028207614086568356, + "learning_rate": 0.24356557046774852, + "loss": 0.1025, + "num_input_tokens_seen": 10334944, + "step": 11425 + }, + { + "epoch": 3.016629272799261, + "grad_norm": 0.0019418291049078107, + "learning_rate": 0.24351952276790606, + "loss": 0.0753, + "num_input_tokens_seen": 10339232, + "step": 11430 + }, + { + "epoch": 3.017949056354758, + "grad_norm": 0.0012426214525476098, + "learning_rate": 0.24347346064617797, + "loss": 0.0588, + "num_input_tokens_seen": 10343808, + "step": 11435 + }, + { + "epoch": 3.0192688399102545, + "grad_norm": 0.0024892636574804783, + "learning_rate": 0.24342738410966758, + "loss": 0.0873, + "num_input_tokens_seen": 10348288, + "step": 11440 + }, + { + "epoch": 3.0205886234657515, + "grad_norm": 0.0027223932556807995, + "learning_rate": 0.24338129316548046, + "loss": 0.1, + "num_input_tokens_seen": 10352736, + "step": 11445 + }, + { + "epoch": 3.0219084070212485, + "grad_norm": 0.001395228668116033, + "learning_rate": 0.24333518782072444, + "loss": 0.0472, + "num_input_tokens_seen": 10357312, + "step": 11450 + }, + { + "epoch": 3.0232281905767455, + "grad_norm": 0.0033858211245387793, + "learning_rate": 0.24328906808250952, + "loss": 0.1156, + "num_input_tokens_seen": 10361696, + "step": 11455 + }, + { + "epoch": 3.0245479741322425, + "grad_norm": 0.002203123178333044, + "learning_rate": 0.243242933957948, + "loss": 0.0605, + "num_input_tokens_seen": 10366112, + "step": 11460 + }, + { + "epoch": 3.025867757687739, + "grad_norm": 0.002685386687517166, + "learning_rate": 0.24319678545415427, + "loss": 0.0938, + "num_input_tokens_seen": 10370720, + "step": 11465 + }, + { + "epoch": 3.027187541243236, + "grad_norm": 0.0023895723279565573, + "learning_rate": 0.24315062257824507, + "loss": 0.085, + "num_input_tokens_seen": 10375136, + "step": 11470 + }, + { + "epoch": 3.028507324798733, + "grad_norm": 0.0037809929344803095, + "learning_rate": 0.24310444533733921, + "loss": 0.1009, + "num_input_tokens_seen": 10379360, + "step": 11475 + }, + { + "epoch": 3.02982710835423, + "grad_norm": 0.0017828759737312794, + "learning_rate": 0.2430582537385579, + "loss": 0.0779, + "num_input_tokens_seen": 10384192, + "step": 11480 + }, + { + "epoch": 3.0311468919097266, + "grad_norm": 0.0034503370989114046, + "learning_rate": 0.2430120477890244, + "loss": 0.0964, + "num_input_tokens_seen": 10388672, + "step": 11485 + }, + { + "epoch": 3.0324666754652236, + "grad_norm": 0.0015951218083500862, + "learning_rate": 0.24296582749586426, + "loss": 0.0602, + "num_input_tokens_seen": 10393408, + "step": 11490 + }, + { + "epoch": 3.0337864590207206, + "grad_norm": 0.0024989487137645483, + "learning_rate": 0.24291959286620526, + "loss": 0.0639, + "num_input_tokens_seen": 10397632, + "step": 11495 + }, + { + "epoch": 3.0351062425762176, + "grad_norm": 0.0023775906302034855, + "learning_rate": 0.24287334390717738, + "loss": 0.0742, + "num_input_tokens_seen": 10402144, + "step": 11500 + }, + { + "epoch": 3.036426026131714, + "grad_norm": 0.0010555082699283957, + "learning_rate": 0.24282708062591268, + "loss": 0.0941, + "num_input_tokens_seen": 10406528, + "step": 11505 + }, + { + "epoch": 3.037745809687211, + "grad_norm": 0.0031134053133428097, + "learning_rate": 0.24278080302954563, + "loss": 0.1244, + "num_input_tokens_seen": 10411008, + "step": 11510 + }, + { + "epoch": 3.039065593242708, + "grad_norm": 0.001173855271190405, + "learning_rate": 0.24273451112521283, + "loss": 0.0984, + "num_input_tokens_seen": 10415552, + "step": 11515 + }, + { + "epoch": 3.040385376798205, + "grad_norm": 0.0026907988358289003, + "learning_rate": 0.242688204920053, + "loss": 0.0676, + "num_input_tokens_seen": 10420128, + "step": 11520 + }, + { + "epoch": 3.041705160353702, + "grad_norm": 0.0039562080055475235, + "learning_rate": 0.24264188442120715, + "loss": 0.1043, + "num_input_tokens_seen": 10424640, + "step": 11525 + }, + { + "epoch": 3.0430249439091988, + "grad_norm": 0.0017083793645724654, + "learning_rate": 0.24259554963581853, + "loss": 0.1139, + "num_input_tokens_seen": 10429056, + "step": 11530 + }, + { + "epoch": 3.0443447274646958, + "grad_norm": 0.004252574872225523, + "learning_rate": 0.24254920057103257, + "loss": 0.1224, + "num_input_tokens_seen": 10433792, + "step": 11535 + }, + { + "epoch": 3.0456645110201928, + "grad_norm": 0.0017575373640283942, + "learning_rate": 0.24250283723399685, + "loss": 0.0886, + "num_input_tokens_seen": 10438464, + "step": 11540 + }, + { + "epoch": 3.0469842945756898, + "grad_norm": 0.0023308570962399244, + "learning_rate": 0.24245645963186108, + "loss": 0.0766, + "num_input_tokens_seen": 10443072, + "step": 11545 + }, + { + "epoch": 3.0483040781311863, + "grad_norm": 0.0027223979122936726, + "learning_rate": 0.2424100677717774, + "loss": 0.1456, + "num_input_tokens_seen": 10447168, + "step": 11550 + }, + { + "epoch": 3.0496238616866833, + "grad_norm": 0.001436819089576602, + "learning_rate": 0.24236366166090004, + "loss": 0.0856, + "num_input_tokens_seen": 10451808, + "step": 11555 + }, + { + "epoch": 3.0509436452421803, + "grad_norm": 0.001939457724802196, + "learning_rate": 0.24231724130638527, + "loss": 0.0908, + "num_input_tokens_seen": 10456640, + "step": 11560 + }, + { + "epoch": 3.0522634287976773, + "grad_norm": 0.001731215976178646, + "learning_rate": 0.2422708067153917, + "loss": 0.0892, + "num_input_tokens_seen": 10461376, + "step": 11565 + }, + { + "epoch": 3.053583212353174, + "grad_norm": 0.0016200965037569404, + "learning_rate": 0.24222435789508026, + "loss": 0.1076, + "num_input_tokens_seen": 10465792, + "step": 11570 + }, + { + "epoch": 3.054902995908671, + "grad_norm": 0.002436818787828088, + "learning_rate": 0.24217789485261387, + "loss": 0.0806, + "num_input_tokens_seen": 10470304, + "step": 11575 + }, + { + "epoch": 3.056222779464168, + "grad_norm": 0.0017739413306117058, + "learning_rate": 0.2421314175951577, + "loss": 0.1108, + "num_input_tokens_seen": 10475072, + "step": 11580 + }, + { + "epoch": 3.057542563019665, + "grad_norm": 0.0012319646775722504, + "learning_rate": 0.2420849261298791, + "loss": 0.0615, + "num_input_tokens_seen": 10479712, + "step": 11585 + }, + { + "epoch": 3.058862346575162, + "grad_norm": 0.00233441567979753, + "learning_rate": 0.24203842046394775, + "loss": 0.1368, + "num_input_tokens_seen": 10483904, + "step": 11590 + }, + { + "epoch": 3.0601821301306584, + "grad_norm": 0.0028630835004150867, + "learning_rate": 0.24199190060453535, + "loss": 0.0945, + "num_input_tokens_seen": 10488320, + "step": 11595 + }, + { + "epoch": 3.0615019136861554, + "grad_norm": 0.0018235910683870316, + "learning_rate": 0.2419453665588158, + "loss": 0.1159, + "num_input_tokens_seen": 10492768, + "step": 11600 + }, + { + "epoch": 3.0615019136861554, + "eval_loss": 0.11583953350782394, + "eval_runtime": 75.8704, + "eval_samples_per_second": 88.77, + "eval_steps_per_second": 22.196, + "num_input_tokens_seen": 10492768, + "step": 11600 + }, + { + "epoch": 3.0628216972416524, + "grad_norm": 0.0018833389040082693, + "learning_rate": 0.24189881833396523, + "loss": 0.1007, + "num_input_tokens_seen": 10497408, + "step": 11605 + }, + { + "epoch": 3.0641414807971494, + "grad_norm": 0.0029834590386599302, + "learning_rate": 0.24185225593716203, + "loss": 0.0743, + "num_input_tokens_seen": 10501984, + "step": 11610 + }, + { + "epoch": 3.065461264352646, + "grad_norm": 0.002141958335414529, + "learning_rate": 0.2418056793755867, + "loss": 0.0779, + "num_input_tokens_seen": 10506528, + "step": 11615 + }, + { + "epoch": 3.066781047908143, + "grad_norm": 0.0016426609363406897, + "learning_rate": 0.24175908865642187, + "loss": 0.0646, + "num_input_tokens_seen": 10511072, + "step": 11620 + }, + { + "epoch": 3.06810083146364, + "grad_norm": 0.0013315345859155059, + "learning_rate": 0.24171248378685248, + "loss": 0.0746, + "num_input_tokens_seen": 10515712, + "step": 11625 + }, + { + "epoch": 3.069420615019137, + "grad_norm": 0.0014989608898758888, + "learning_rate": 0.24166586477406554, + "loss": 0.0598, + "num_input_tokens_seen": 10520160, + "step": 11630 + }, + { + "epoch": 3.0707403985746335, + "grad_norm": 0.002838312881067395, + "learning_rate": 0.24161923162525034, + "loss": 0.0809, + "num_input_tokens_seen": 10524416, + "step": 11635 + }, + { + "epoch": 3.0720601821301305, + "grad_norm": 0.00367749878205359, + "learning_rate": 0.2415725843475982, + "loss": 0.1309, + "num_input_tokens_seen": 10528576, + "step": 11640 + }, + { + "epoch": 3.0733799656856275, + "grad_norm": 0.0018920740112662315, + "learning_rate": 0.24152592294830286, + "loss": 0.0646, + "num_input_tokens_seen": 10533504, + "step": 11645 + }, + { + "epoch": 3.0746997492411245, + "grad_norm": 0.006051233503967524, + "learning_rate": 0.24147924743455995, + "loss": 0.1013, + "num_input_tokens_seen": 10538048, + "step": 11650 + }, + { + "epoch": 3.0760195327966215, + "grad_norm": 0.003654691157862544, + "learning_rate": 0.24143255781356754, + "loss": 0.1397, + "num_input_tokens_seen": 10542560, + "step": 11655 + }, + { + "epoch": 3.077339316352118, + "grad_norm": 0.002321887295693159, + "learning_rate": 0.24138585409252566, + "loss": 0.114, + "num_input_tokens_seen": 10547264, + "step": 11660 + }, + { + "epoch": 3.078659099907615, + "grad_norm": 0.002805352210998535, + "learning_rate": 0.24133913627863662, + "loss": 0.0987, + "num_input_tokens_seen": 10551744, + "step": 11665 + }, + { + "epoch": 3.079978883463112, + "grad_norm": 0.0018946710042655468, + "learning_rate": 0.241292404379105, + "loss": 0.1115, + "num_input_tokens_seen": 10556256, + "step": 11670 + }, + { + "epoch": 3.081298667018609, + "grad_norm": 0.0013771301601082087, + "learning_rate": 0.24124565840113735, + "loss": 0.0776, + "num_input_tokens_seen": 10560608, + "step": 11675 + }, + { + "epoch": 3.0826184505741057, + "grad_norm": 0.001872858963906765, + "learning_rate": 0.2411988983519425, + "loss": 0.0879, + "num_input_tokens_seen": 10565024, + "step": 11680 + }, + { + "epoch": 3.0839382341296027, + "grad_norm": 0.0020583407022058964, + "learning_rate": 0.24115212423873145, + "loss": 0.0877, + "num_input_tokens_seen": 10569440, + "step": 11685 + }, + { + "epoch": 3.0852580176850997, + "grad_norm": 0.0025166457053273916, + "learning_rate": 0.24110533606871737, + "loss": 0.0992, + "num_input_tokens_seen": 10574016, + "step": 11690 + }, + { + "epoch": 3.0865778012405967, + "grad_norm": 0.0030958889983594418, + "learning_rate": 0.24105853384911552, + "loss": 0.096, + "num_input_tokens_seen": 10578752, + "step": 11695 + }, + { + "epoch": 3.087897584796093, + "grad_norm": 0.0020417601335793734, + "learning_rate": 0.24101171758714346, + "loss": 0.0814, + "num_input_tokens_seen": 10583008, + "step": 11700 + }, + { + "epoch": 3.08921736835159, + "grad_norm": 0.003312816144898534, + "learning_rate": 0.24096488729002086, + "loss": 0.0655, + "num_input_tokens_seen": 10587520, + "step": 11705 + }, + { + "epoch": 3.090537151907087, + "grad_norm": 0.0022315308451652527, + "learning_rate": 0.24091804296496946, + "loss": 0.0685, + "num_input_tokens_seen": 10591968, + "step": 11710 + }, + { + "epoch": 3.091856935462584, + "grad_norm": 0.0017178432317450643, + "learning_rate": 0.2408711846192133, + "loss": 0.0879, + "num_input_tokens_seen": 10596416, + "step": 11715 + }, + { + "epoch": 3.0931767190180812, + "grad_norm": 0.002393729519098997, + "learning_rate": 0.24082431225997855, + "loss": 0.0768, + "num_input_tokens_seen": 10601216, + "step": 11720 + }, + { + "epoch": 3.094496502573578, + "grad_norm": 0.0025016723666340113, + "learning_rate": 0.24077742589449344, + "loss": 0.0975, + "num_input_tokens_seen": 10605728, + "step": 11725 + }, + { + "epoch": 3.095816286129075, + "grad_norm": 0.0037711341865360737, + "learning_rate": 0.24073052552998844, + "loss": 0.0904, + "num_input_tokens_seen": 10610240, + "step": 11730 + }, + { + "epoch": 3.097136069684572, + "grad_norm": 0.0020694660488516092, + "learning_rate": 0.2406836111736963, + "loss": 0.0851, + "num_input_tokens_seen": 10614944, + "step": 11735 + }, + { + "epoch": 3.098455853240069, + "grad_norm": 0.0025480722542852163, + "learning_rate": 0.2406366828328517, + "loss": 0.0658, + "num_input_tokens_seen": 10619296, + "step": 11740 + }, + { + "epoch": 3.0997756367955653, + "grad_norm": 0.0035174195654690266, + "learning_rate": 0.2405897405146915, + "loss": 0.0844, + "num_input_tokens_seen": 10623840, + "step": 11745 + }, + { + "epoch": 3.1010954203510623, + "grad_norm": 0.0009321732795797288, + "learning_rate": 0.240542784226455, + "loss": 0.0598, + "num_input_tokens_seen": 10628128, + "step": 11750 + }, + { + "epoch": 3.1024152039065593, + "grad_norm": 0.0027189881075173616, + "learning_rate": 0.24049581397538328, + "loss": 0.0999, + "num_input_tokens_seen": 10632544, + "step": 11755 + }, + { + "epoch": 3.1037349874620563, + "grad_norm": 0.0023663006722927094, + "learning_rate": 0.24044882976871984, + "loss": 0.0782, + "num_input_tokens_seen": 10637056, + "step": 11760 + }, + { + "epoch": 3.105054771017553, + "grad_norm": 0.002105704741552472, + "learning_rate": 0.2404018316137102, + "loss": 0.1065, + "num_input_tokens_seen": 10641696, + "step": 11765 + }, + { + "epoch": 3.10637455457305, + "grad_norm": 0.0015303944237530231, + "learning_rate": 0.24035481951760204, + "loss": 0.075, + "num_input_tokens_seen": 10646112, + "step": 11770 + }, + { + "epoch": 3.107694338128547, + "grad_norm": 0.0017887934809550643, + "learning_rate": 0.2403077934876452, + "loss": 0.1336, + "num_input_tokens_seen": 10650784, + "step": 11775 + }, + { + "epoch": 3.109014121684044, + "grad_norm": 0.002259206725284457, + "learning_rate": 0.2402607535310918, + "loss": 0.0757, + "num_input_tokens_seen": 10655392, + "step": 11780 + }, + { + "epoch": 3.110333905239541, + "grad_norm": 0.003343041520565748, + "learning_rate": 0.2402136996551959, + "loss": 0.113, + "num_input_tokens_seen": 10659616, + "step": 11785 + }, + { + "epoch": 3.1116536887950375, + "grad_norm": 0.002245477866381407, + "learning_rate": 0.24016663186721376, + "loss": 0.0657, + "num_input_tokens_seen": 10664416, + "step": 11790 + }, + { + "epoch": 3.1129734723505345, + "grad_norm": 0.0022959569469094276, + "learning_rate": 0.24011955017440395, + "loss": 0.0593, + "num_input_tokens_seen": 10668832, + "step": 11795 + }, + { + "epoch": 3.1142932559060315, + "grad_norm": 0.002098318887874484, + "learning_rate": 0.24007245458402696, + "loss": 0.1052, + "num_input_tokens_seen": 10673088, + "step": 11800 + }, + { + "epoch": 3.1142932559060315, + "eval_loss": 0.10483235865831375, + "eval_runtime": 75.8716, + "eval_samples_per_second": 88.768, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 10673088, + "step": 11800 + }, + { + "epoch": 3.1156130394615285, + "grad_norm": 0.003499183338135481, + "learning_rate": 0.2400253451033456, + "loss": 0.1042, + "num_input_tokens_seen": 10677728, + "step": 11805 + }, + { + "epoch": 3.116932823017025, + "grad_norm": 0.0028426682110875845, + "learning_rate": 0.23997822173962463, + "loss": 0.088, + "num_input_tokens_seen": 10682304, + "step": 11810 + }, + { + "epoch": 3.118252606572522, + "grad_norm": 0.001147415372543037, + "learning_rate": 0.23993108450013118, + "loss": 0.0667, + "num_input_tokens_seen": 10686656, + "step": 11815 + }, + { + "epoch": 3.119572390128019, + "grad_norm": 0.0013532049488276243, + "learning_rate": 0.2398839333921343, + "loss": 0.0915, + "num_input_tokens_seen": 10691296, + "step": 11820 + }, + { + "epoch": 3.120892173683516, + "grad_norm": 0.004025583155453205, + "learning_rate": 0.23983676842290536, + "loss": 0.0978, + "num_input_tokens_seen": 10695680, + "step": 11825 + }, + { + "epoch": 3.122211957239013, + "grad_norm": 0.002507096156477928, + "learning_rate": 0.2397895895997178, + "loss": 0.1043, + "num_input_tokens_seen": 10699904, + "step": 11830 + }, + { + "epoch": 3.1235317407945096, + "grad_norm": 0.001864190911874175, + "learning_rate": 0.23974239692984714, + "loss": 0.0754, + "num_input_tokens_seen": 10704320, + "step": 11835 + }, + { + "epoch": 3.1248515243500066, + "grad_norm": 0.00272544682957232, + "learning_rate": 0.2396951904205711, + "loss": 0.0675, + "num_input_tokens_seen": 10708864, + "step": 11840 + }, + { + "epoch": 3.1261713079055036, + "grad_norm": 0.0030390857718884945, + "learning_rate": 0.23964797007916952, + "loss": 0.0895, + "num_input_tokens_seen": 10713120, + "step": 11845 + }, + { + "epoch": 3.1274910914610006, + "grad_norm": 0.002643575891852379, + "learning_rate": 0.23960073591292436, + "loss": 0.1073, + "num_input_tokens_seen": 10717696, + "step": 11850 + }, + { + "epoch": 3.128810875016497, + "grad_norm": 0.002728400519117713, + "learning_rate": 0.2395534879291197, + "loss": 0.1107, + "num_input_tokens_seen": 10722688, + "step": 11855 + }, + { + "epoch": 3.130130658571994, + "grad_norm": 0.0013630251633003354, + "learning_rate": 0.23950622613504186, + "loss": 0.1052, + "num_input_tokens_seen": 10727264, + "step": 11860 + }, + { + "epoch": 3.131450442127491, + "grad_norm": 0.0023393433075398207, + "learning_rate": 0.2394589505379791, + "loss": 0.1091, + "num_input_tokens_seen": 10732032, + "step": 11865 + }, + { + "epoch": 3.132770225682988, + "grad_norm": 0.0018758575897663832, + "learning_rate": 0.23941166114522197, + "loss": 0.1244, + "num_input_tokens_seen": 10736544, + "step": 11870 + }, + { + "epoch": 3.1340900092384847, + "grad_norm": 0.0037669064477086067, + "learning_rate": 0.23936435796406308, + "loss": 0.0943, + "num_input_tokens_seen": 10741408, + "step": 11875 + }, + { + "epoch": 3.1354097927939817, + "grad_norm": 0.0016347371274605393, + "learning_rate": 0.23931704100179715, + "loss": 0.0784, + "num_input_tokens_seen": 10745920, + "step": 11880 + }, + { + "epoch": 3.1367295763494787, + "grad_norm": 0.0025095681194216013, + "learning_rate": 0.2392697102657211, + "loss": 0.0817, + "num_input_tokens_seen": 10750592, + "step": 11885 + }, + { + "epoch": 3.1380493599049757, + "grad_norm": 0.0035076006315648556, + "learning_rate": 0.23922236576313388, + "loss": 0.1211, + "num_input_tokens_seen": 10755168, + "step": 11890 + }, + { + "epoch": 3.1393691434604727, + "grad_norm": 0.001466701040044427, + "learning_rate": 0.2391750075013366, + "loss": 0.0786, + "num_input_tokens_seen": 10759680, + "step": 11895 + }, + { + "epoch": 3.1406889270159692, + "grad_norm": 0.0021732342429459095, + "learning_rate": 0.2391276354876326, + "loss": 0.0743, + "num_input_tokens_seen": 10764192, + "step": 11900 + }, + { + "epoch": 3.1420087105714662, + "grad_norm": 0.003125677350908518, + "learning_rate": 0.23908024972932707, + "loss": 0.1172, + "num_input_tokens_seen": 10768672, + "step": 11905 + }, + { + "epoch": 3.1433284941269632, + "grad_norm": 0.0014443860854953527, + "learning_rate": 0.2390328502337276, + "loss": 0.0518, + "num_input_tokens_seen": 10772864, + "step": 11910 + }, + { + "epoch": 3.1446482776824602, + "grad_norm": 0.0027959109283983707, + "learning_rate": 0.23898543700814376, + "loss": 0.125, + "num_input_tokens_seen": 10777088, + "step": 11915 + }, + { + "epoch": 3.145968061237957, + "grad_norm": 0.002945191692560911, + "learning_rate": 0.2389380100598873, + "loss": 0.1125, + "num_input_tokens_seen": 10781888, + "step": 11920 + }, + { + "epoch": 3.147287844793454, + "grad_norm": 0.003412553807720542, + "learning_rate": 0.23889056939627207, + "loss": 0.0717, + "num_input_tokens_seen": 10786496, + "step": 11925 + }, + { + "epoch": 3.148607628348951, + "grad_norm": 0.0034163594245910645, + "learning_rate": 0.23884311502461386, + "loss": 0.1199, + "num_input_tokens_seen": 10791200, + "step": 11930 + }, + { + "epoch": 3.149927411904448, + "grad_norm": 0.0028797280974686146, + "learning_rate": 0.23879564695223088, + "loss": 0.0816, + "num_input_tokens_seen": 10795840, + "step": 11935 + }, + { + "epoch": 3.1512471954599444, + "grad_norm": 0.00271726050414145, + "learning_rate": 0.23874816518644332, + "loss": 0.0973, + "num_input_tokens_seen": 10800320, + "step": 11940 + }, + { + "epoch": 3.1525669790154414, + "grad_norm": 0.0024865816812962294, + "learning_rate": 0.23870066973457335, + "loss": 0.0991, + "num_input_tokens_seen": 10804480, + "step": 11945 + }, + { + "epoch": 3.1538867625709384, + "grad_norm": 0.0011735132429748774, + "learning_rate": 0.23865316060394545, + "loss": 0.0593, + "num_input_tokens_seen": 10808832, + "step": 11950 + }, + { + "epoch": 3.1552065461264354, + "grad_norm": 0.003280996112152934, + "learning_rate": 0.2386056378018861, + "loss": 0.0889, + "num_input_tokens_seen": 10813248, + "step": 11955 + }, + { + "epoch": 3.1565263296819324, + "grad_norm": 0.0009813716169446707, + "learning_rate": 0.2385581013357239, + "loss": 0.096, + "num_input_tokens_seen": 10817632, + "step": 11960 + }, + { + "epoch": 3.157846113237429, + "grad_norm": 0.003451112424954772, + "learning_rate": 0.23851055121278958, + "loss": 0.1047, + "num_input_tokens_seen": 10822336, + "step": 11965 + }, + { + "epoch": 3.159165896792926, + "grad_norm": 0.0015747229335829616, + "learning_rate": 0.23846298744041594, + "loss": 0.0677, + "num_input_tokens_seen": 10827008, + "step": 11970 + }, + { + "epoch": 3.160485680348423, + "grad_norm": 0.0014960860135033727, + "learning_rate": 0.23841541002593802, + "loss": 0.0844, + "num_input_tokens_seen": 10831616, + "step": 11975 + }, + { + "epoch": 3.16180546390392, + "grad_norm": 0.0017135579837486148, + "learning_rate": 0.23836781897669276, + "loss": 0.082, + "num_input_tokens_seen": 10836000, + "step": 11980 + }, + { + "epoch": 3.1631252474594165, + "grad_norm": 0.0006260402733460069, + "learning_rate": 0.23832021430001926, + "loss": 0.1118, + "num_input_tokens_seen": 10840544, + "step": 11985 + }, + { + "epoch": 3.1644450310149135, + "grad_norm": 0.0017869556322693825, + "learning_rate": 0.2382725960032588, + "loss": 0.044, + "num_input_tokens_seen": 10845280, + "step": 11990 + }, + { + "epoch": 3.1657648145704105, + "grad_norm": 0.004728809930384159, + "learning_rate": 0.23822496409375482, + "loss": 0.1136, + "num_input_tokens_seen": 10849920, + "step": 11995 + }, + { + "epoch": 3.1670845981259075, + "grad_norm": 0.0012566912919282913, + "learning_rate": 0.2381773185788526, + "loss": 0.0716, + "num_input_tokens_seen": 10854592, + "step": 12000 + }, + { + "epoch": 3.1670845981259075, + "eval_loss": 0.10795924812555313, + "eval_runtime": 75.8561, + "eval_samples_per_second": 88.787, + "eval_steps_per_second": 22.2, + "num_input_tokens_seen": 10854592, + "step": 12000 + }, + { + "epoch": 3.1684043816814045, + "grad_norm": 0.002424865495413542, + "learning_rate": 0.2381296594658998, + "loss": 0.0691, + "num_input_tokens_seen": 10859200, + "step": 12005 + }, + { + "epoch": 3.169724165236901, + "grad_norm": 0.003405128140002489, + "learning_rate": 0.238081986762246, + "loss": 0.0654, + "num_input_tokens_seen": 10863616, + "step": 12010 + }, + { + "epoch": 3.171043948792398, + "grad_norm": 0.002498301910236478, + "learning_rate": 0.23803430047524293, + "loss": 0.1314, + "num_input_tokens_seen": 10867968, + "step": 12015 + }, + { + "epoch": 3.172363732347895, + "grad_norm": 0.0027079633437097073, + "learning_rate": 0.23798660061224441, + "loss": 0.1131, + "num_input_tokens_seen": 10872544, + "step": 12020 + }, + { + "epoch": 3.173683515903392, + "grad_norm": 0.0025394067633897066, + "learning_rate": 0.23793888718060632, + "loss": 0.1381, + "num_input_tokens_seen": 10876928, + "step": 12025 + }, + { + "epoch": 3.1750032994588886, + "grad_norm": 0.002510787220671773, + "learning_rate": 0.23789116018768675, + "loss": 0.095, + "num_input_tokens_seen": 10881184, + "step": 12030 + }, + { + "epoch": 3.1763230830143856, + "grad_norm": 0.0029299191664904356, + "learning_rate": 0.2378434196408458, + "loss": 0.0754, + "num_input_tokens_seen": 10885952, + "step": 12035 + }, + { + "epoch": 3.1776428665698826, + "grad_norm": 0.0034411021042615175, + "learning_rate": 0.23779566554744563, + "loss": 0.0992, + "num_input_tokens_seen": 10890688, + "step": 12040 + }, + { + "epoch": 3.1789626501253796, + "grad_norm": 0.002697885036468506, + "learning_rate": 0.23774789791485051, + "loss": 0.1265, + "num_input_tokens_seen": 10895456, + "step": 12045 + }, + { + "epoch": 3.180282433680876, + "grad_norm": 0.0018552826950326562, + "learning_rate": 0.2377001167504268, + "loss": 0.0875, + "num_input_tokens_seen": 10899840, + "step": 12050 + }, + { + "epoch": 3.181602217236373, + "grad_norm": 0.004403938073664904, + "learning_rate": 0.23765232206154302, + "loss": 0.0659, + "num_input_tokens_seen": 10904320, + "step": 12055 + }, + { + "epoch": 3.18292200079187, + "grad_norm": 0.0024654872249811888, + "learning_rate": 0.23760451385556966, + "loss": 0.0975, + "num_input_tokens_seen": 10908896, + "step": 12060 + }, + { + "epoch": 3.184241784347367, + "grad_norm": 0.0027084422763437033, + "learning_rate": 0.23755669213987932, + "loss": 0.0999, + "num_input_tokens_seen": 10913376, + "step": 12065 + }, + { + "epoch": 3.185561567902864, + "grad_norm": 0.0053642671555280685, + "learning_rate": 0.23750885692184676, + "loss": 0.1229, + "num_input_tokens_seen": 10917824, + "step": 12070 + }, + { + "epoch": 3.1868813514583607, + "grad_norm": 0.0017240323359146714, + "learning_rate": 0.23746100820884875, + "loss": 0.0957, + "num_input_tokens_seen": 10922144, + "step": 12075 + }, + { + "epoch": 3.1882011350138577, + "grad_norm": 0.002611108124256134, + "learning_rate": 0.23741314600826421, + "loss": 0.0949, + "num_input_tokens_seen": 10926304, + "step": 12080 + }, + { + "epoch": 3.1895209185693547, + "grad_norm": 0.002755651017650962, + "learning_rate": 0.23736527032747406, + "loss": 0.1166, + "num_input_tokens_seen": 10930944, + "step": 12085 + }, + { + "epoch": 3.1908407021248517, + "grad_norm": 0.003084976691752672, + "learning_rate": 0.23731738117386128, + "loss": 0.093, + "num_input_tokens_seen": 10935424, + "step": 12090 + }, + { + "epoch": 3.1921604856803483, + "grad_norm": 0.004149019252508879, + "learning_rate": 0.237269478554811, + "loss": 0.085, + "num_input_tokens_seen": 10940000, + "step": 12095 + }, + { + "epoch": 3.1934802692358453, + "grad_norm": 0.0019453888526186347, + "learning_rate": 0.23722156247771053, + "loss": 0.0767, + "num_input_tokens_seen": 10944448, + "step": 12100 + }, + { + "epoch": 3.1948000527913423, + "grad_norm": 0.0018261547666043043, + "learning_rate": 0.23717363294994895, + "loss": 0.087, + "num_input_tokens_seen": 10948832, + "step": 12105 + }, + { + "epoch": 3.1961198363468393, + "grad_norm": 0.001256727846339345, + "learning_rate": 0.2371256899789177, + "loss": 0.0845, + "num_input_tokens_seen": 10953344, + "step": 12110 + }, + { + "epoch": 3.197439619902336, + "grad_norm": 0.0030629283282905817, + "learning_rate": 0.23707773357201017, + "loss": 0.1047, + "num_input_tokens_seen": 10957920, + "step": 12115 + }, + { + "epoch": 3.198759403457833, + "grad_norm": 0.0028406959027051926, + "learning_rate": 0.2370297637366218, + "loss": 0.0646, + "num_input_tokens_seen": 10962848, + "step": 12120 + }, + { + "epoch": 3.20007918701333, + "grad_norm": 0.0018858765251934528, + "learning_rate": 0.23698178048015026, + "loss": 0.0454, + "num_input_tokens_seen": 10967392, + "step": 12125 + }, + { + "epoch": 3.201398970568827, + "grad_norm": 0.0017247885698452592, + "learning_rate": 0.236933783809995, + "loss": 0.1544, + "num_input_tokens_seen": 10971936, + "step": 12130 + }, + { + "epoch": 3.202718754124324, + "grad_norm": 0.0014637020649388433, + "learning_rate": 0.23688577373355785, + "loss": 0.1349, + "num_input_tokens_seen": 10976576, + "step": 12135 + }, + { + "epoch": 3.2040385376798204, + "grad_norm": 0.002526928437873721, + "learning_rate": 0.23683775025824247, + "loss": 0.1026, + "num_input_tokens_seen": 10981056, + "step": 12140 + }, + { + "epoch": 3.2053583212353174, + "grad_norm": 0.005180693231523037, + "learning_rate": 0.2367897133914548, + "loss": 0.1311, + "num_input_tokens_seen": 10985824, + "step": 12145 + }, + { + "epoch": 3.2066781047908144, + "grad_norm": 0.0019483508076518774, + "learning_rate": 0.2367416631406026, + "loss": 0.125, + "num_input_tokens_seen": 10990208, + "step": 12150 + }, + { + "epoch": 3.2079978883463114, + "grad_norm": 0.0025477290619164705, + "learning_rate": 0.23669359951309588, + "loss": 0.0818, + "num_input_tokens_seen": 10994784, + "step": 12155 + }, + { + "epoch": 3.209317671901808, + "grad_norm": 0.0022498329635709524, + "learning_rate": 0.23664552251634666, + "loss": 0.1036, + "num_input_tokens_seen": 10999328, + "step": 12160 + }, + { + "epoch": 3.210637455457305, + "grad_norm": 0.0021111550740897655, + "learning_rate": 0.23659743215776907, + "loss": 0.1268, + "num_input_tokens_seen": 11003616, + "step": 12165 + }, + { + "epoch": 3.211957239012802, + "grad_norm": 0.002123224316164851, + "learning_rate": 0.23654932844477908, + "loss": 0.1036, + "num_input_tokens_seen": 11008256, + "step": 12170 + }, + { + "epoch": 3.213277022568299, + "grad_norm": 0.0021040982101112604, + "learning_rate": 0.23650121138479507, + "loss": 0.108, + "num_input_tokens_seen": 11012608, + "step": 12175 + }, + { + "epoch": 3.2145968061237955, + "grad_norm": 0.0015913508832454681, + "learning_rate": 0.23645308098523724, + "loss": 0.0859, + "num_input_tokens_seen": 11017408, + "step": 12180 + }, + { + "epoch": 3.2159165896792925, + "grad_norm": 0.001989200245589018, + "learning_rate": 0.23640493725352785, + "loss": 0.0834, + "num_input_tokens_seen": 11022016, + "step": 12185 + }, + { + "epoch": 3.2172363732347895, + "grad_norm": 0.0023895909544080496, + "learning_rate": 0.2363567801970913, + "loss": 0.1311, + "num_input_tokens_seen": 11026496, + "step": 12190 + }, + { + "epoch": 3.2185561567902865, + "grad_norm": 0.0016515256138518453, + "learning_rate": 0.236308609823354, + "loss": 0.0972, + "num_input_tokens_seen": 11031168, + "step": 12195 + }, + { + "epoch": 3.2198759403457835, + "grad_norm": 0.0004846701631322503, + "learning_rate": 0.23626042613974452, + "loss": 0.0559, + "num_input_tokens_seen": 11035424, + "step": 12200 + }, + { + "epoch": 3.2198759403457835, + "eval_loss": 0.11738286167383194, + "eval_runtime": 75.8647, + "eval_samples_per_second": 88.776, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 11035424, + "step": 12200 + }, + { + "epoch": 3.22119572390128, + "grad_norm": 0.004054889548569918, + "learning_rate": 0.23621222915369325, + "loss": 0.115, + "num_input_tokens_seen": 11039584, + "step": 12205 + }, + { + "epoch": 3.222515507456777, + "grad_norm": 0.0026863885577768087, + "learning_rate": 0.23616401887263283, + "loss": 0.0858, + "num_input_tokens_seen": 11044320, + "step": 12210 + }, + { + "epoch": 3.223835291012274, + "grad_norm": 0.001692016958259046, + "learning_rate": 0.23611579530399793, + "loss": 0.0479, + "num_input_tokens_seen": 11049088, + "step": 12215 + }, + { + "epoch": 3.225155074567771, + "grad_norm": 0.00386865739710629, + "learning_rate": 0.23606755845522517, + "loss": 0.0894, + "num_input_tokens_seen": 11053568, + "step": 12220 + }, + { + "epoch": 3.2264748581232676, + "grad_norm": 0.001959246350452304, + "learning_rate": 0.23601930833375329, + "loss": 0.0975, + "num_input_tokens_seen": 11058016, + "step": 12225 + }, + { + "epoch": 3.2277946416787646, + "grad_norm": 0.002311341930180788, + "learning_rate": 0.23597104494702312, + "loss": 0.0988, + "num_input_tokens_seen": 11062560, + "step": 12230 + }, + { + "epoch": 3.2291144252342616, + "grad_norm": 0.00227008992806077, + "learning_rate": 0.23592276830247744, + "loss": 0.1303, + "num_input_tokens_seen": 11066880, + "step": 12235 + }, + { + "epoch": 3.2304342087897586, + "grad_norm": 0.0020076599903404713, + "learning_rate": 0.2358744784075611, + "loss": 0.0644, + "num_input_tokens_seen": 11071424, + "step": 12240 + }, + { + "epoch": 3.231753992345255, + "grad_norm": 0.0018581822514533997, + "learning_rate": 0.235826175269721, + "loss": 0.1145, + "num_input_tokens_seen": 11075872, + "step": 12245 + }, + { + "epoch": 3.233073775900752, + "grad_norm": 0.0013398687588050961, + "learning_rate": 0.23577785889640612, + "loss": 0.0763, + "num_input_tokens_seen": 11080480, + "step": 12250 + }, + { + "epoch": 3.234393559456249, + "grad_norm": 0.003487294539809227, + "learning_rate": 0.23572952929506744, + "loss": 0.1135, + "num_input_tokens_seen": 11084992, + "step": 12255 + }, + { + "epoch": 3.235713343011746, + "grad_norm": 0.003507607616484165, + "learning_rate": 0.23568118647315803, + "loss": 0.1143, + "num_input_tokens_seen": 11089408, + "step": 12260 + }, + { + "epoch": 3.237033126567243, + "grad_norm": 0.0007783446344546974, + "learning_rate": 0.23563283043813296, + "loss": 0.0967, + "num_input_tokens_seen": 11093632, + "step": 12265 + }, + { + "epoch": 3.2383529101227397, + "grad_norm": 0.002812798600643873, + "learning_rate": 0.23558446119744922, + "loss": 0.0883, + "num_input_tokens_seen": 11098144, + "step": 12270 + }, + { + "epoch": 3.2396726936782367, + "grad_norm": 0.002482658950611949, + "learning_rate": 0.23553607875856608, + "loss": 0.0643, + "num_input_tokens_seen": 11102944, + "step": 12275 + }, + { + "epoch": 3.2409924772337337, + "grad_norm": 0.0037939180620014668, + "learning_rate": 0.2354876831289447, + "loss": 0.1426, + "num_input_tokens_seen": 11107808, + "step": 12280 + }, + { + "epoch": 3.2423122607892307, + "grad_norm": 0.0026992070488631725, + "learning_rate": 0.23543927431604827, + "loss": 0.1066, + "num_input_tokens_seen": 11112736, + "step": 12285 + }, + { + "epoch": 3.2436320443447273, + "grad_norm": 0.0024082865566015244, + "learning_rate": 0.23539085232734203, + "loss": 0.0836, + "num_input_tokens_seen": 11117184, + "step": 12290 + }, + { + "epoch": 3.2449518279002243, + "grad_norm": 0.0012940652668476105, + "learning_rate": 0.2353424171702933, + "loss": 0.0559, + "num_input_tokens_seen": 11121504, + "step": 12295 + }, + { + "epoch": 3.2462716114557213, + "grad_norm": 0.003505986649543047, + "learning_rate": 0.23529396885237133, + "loss": 0.1334, + "num_input_tokens_seen": 11125856, + "step": 12300 + }, + { + "epoch": 3.2475913950112183, + "grad_norm": 0.001620254130102694, + "learning_rate": 0.2352455073810475, + "loss": 0.1069, + "num_input_tokens_seen": 11130496, + "step": 12305 + }, + { + "epoch": 3.248911178566715, + "grad_norm": 0.0022817165590822697, + "learning_rate": 0.23519703276379517, + "loss": 0.1119, + "num_input_tokens_seen": 11134848, + "step": 12310 + }, + { + "epoch": 3.250230962122212, + "grad_norm": 0.0010427322704344988, + "learning_rate": 0.2351485450080897, + "loss": 0.1024, + "num_input_tokens_seen": 11139584, + "step": 12315 + }, + { + "epoch": 3.251550745677709, + "grad_norm": 0.0023409307468682528, + "learning_rate": 0.2351000441214086, + "loss": 0.1149, + "num_input_tokens_seen": 11144160, + "step": 12320 + }, + { + "epoch": 3.252870529233206, + "grad_norm": 0.0019184891134500504, + "learning_rate": 0.23505153011123125, + "loss": 0.0988, + "num_input_tokens_seen": 11148928, + "step": 12325 + }, + { + "epoch": 3.254190312788703, + "grad_norm": 0.0018542781472206116, + "learning_rate": 0.23500300298503912, + "loss": 0.1086, + "num_input_tokens_seen": 11153472, + "step": 12330 + }, + { + "epoch": 3.2555100963441994, + "grad_norm": 0.001490549766458571, + "learning_rate": 0.23495446275031576, + "loss": 0.0762, + "num_input_tokens_seen": 11158336, + "step": 12335 + }, + { + "epoch": 3.2568298798996964, + "grad_norm": 0.0026542004197835922, + "learning_rate": 0.2349059094145466, + "loss": 0.0707, + "num_input_tokens_seen": 11162656, + "step": 12340 + }, + { + "epoch": 3.2581496634551934, + "grad_norm": 0.002954792696982622, + "learning_rate": 0.2348573429852192, + "loss": 0.1182, + "num_input_tokens_seen": 11167200, + "step": 12345 + }, + { + "epoch": 3.2594694470106904, + "grad_norm": 0.0021394342184066772, + "learning_rate": 0.23480876346982313, + "loss": 0.1077, + "num_input_tokens_seen": 11171552, + "step": 12350 + }, + { + "epoch": 3.260789230566187, + "grad_norm": 0.0019153882749378681, + "learning_rate": 0.23476017087585, + "loss": 0.1219, + "num_input_tokens_seen": 11176000, + "step": 12355 + }, + { + "epoch": 3.262109014121684, + "grad_norm": 0.0034223708789795637, + "learning_rate": 0.23471156521079334, + "loss": 0.1004, + "num_input_tokens_seen": 11180544, + "step": 12360 + }, + { + "epoch": 3.263428797677181, + "grad_norm": 0.0007042933139018714, + "learning_rate": 0.23466294648214875, + "loss": 0.0722, + "num_input_tokens_seen": 11185152, + "step": 12365 + }, + { + "epoch": 3.264748581232678, + "grad_norm": 0.0032468910794705153, + "learning_rate": 0.2346143146974139, + "loss": 0.1208, + "num_input_tokens_seen": 11189696, + "step": 12370 + }, + { + "epoch": 3.2660683647881745, + "grad_norm": 0.0015040207654237747, + "learning_rate": 0.23456566986408836, + "loss": 0.0925, + "num_input_tokens_seen": 11194400, + "step": 12375 + }, + { + "epoch": 3.2673881483436715, + "grad_norm": 0.0025631680618971586, + "learning_rate": 0.23451701198967384, + "loss": 0.087, + "num_input_tokens_seen": 11198912, + "step": 12380 + }, + { + "epoch": 3.2687079318991685, + "grad_norm": 0.0018176556332036853, + "learning_rate": 0.23446834108167397, + "loss": 0.0678, + "num_input_tokens_seen": 11204000, + "step": 12385 + }, + { + "epoch": 3.2700277154546655, + "grad_norm": 0.003167872317135334, + "learning_rate": 0.23441965714759438, + "loss": 0.1002, + "num_input_tokens_seen": 11208416, + "step": 12390 + }, + { + "epoch": 3.2713474990101625, + "grad_norm": 0.001756807672791183, + "learning_rate": 0.23437096019494277, + "loss": 0.077, + "num_input_tokens_seen": 11213024, + "step": 12395 + }, + { + "epoch": 3.272667282565659, + "grad_norm": 0.0021690442226827145, + "learning_rate": 0.23432225023122885, + "loss": 0.0517, + "num_input_tokens_seen": 11217728, + "step": 12400 + }, + { + "epoch": 3.272667282565659, + "eval_loss": 0.1050054207444191, + "eval_runtime": 75.9037, + "eval_samples_per_second": 88.731, + "eval_steps_per_second": 22.186, + "num_input_tokens_seen": 11217728, + "step": 12400 + }, + { + "epoch": 3.273987066121156, + "grad_norm": 0.002247509080916643, + "learning_rate": 0.23427352726396428, + "loss": 0.0989, + "num_input_tokens_seen": 11222624, + "step": 12405 + }, + { + "epoch": 3.275306849676653, + "grad_norm": 0.002169666113331914, + "learning_rate": 0.2342247913006628, + "loss": 0.1043, + "num_input_tokens_seen": 11226848, + "step": 12410 + }, + { + "epoch": 3.27662663323215, + "grad_norm": 0.001040325965732336, + "learning_rate": 0.23417604234883999, + "loss": 0.1001, + "num_input_tokens_seen": 11231424, + "step": 12415 + }, + { + "epoch": 3.2779464167876466, + "grad_norm": 0.0016599268419668078, + "learning_rate": 0.23412728041601363, + "loss": 0.0877, + "num_input_tokens_seen": 11235776, + "step": 12420 + }, + { + "epoch": 3.2792662003431436, + "grad_norm": 0.0012450566282495856, + "learning_rate": 0.23407850550970347, + "loss": 0.0909, + "num_input_tokens_seen": 11240352, + "step": 12425 + }, + { + "epoch": 3.2805859838986406, + "grad_norm": 0.0029778152238577604, + "learning_rate": 0.23402971763743116, + "loss": 0.1159, + "num_input_tokens_seen": 11244672, + "step": 12430 + }, + { + "epoch": 3.2819057674541376, + "grad_norm": 0.0019407568033784628, + "learning_rate": 0.23398091680672037, + "loss": 0.0945, + "num_input_tokens_seen": 11249344, + "step": 12435 + }, + { + "epoch": 3.283225551009634, + "grad_norm": 0.002033522352576256, + "learning_rate": 0.23393210302509687, + "loss": 0.0949, + "num_input_tokens_seen": 11254080, + "step": 12440 + }, + { + "epoch": 3.284545334565131, + "grad_norm": 0.0024373484775424004, + "learning_rate": 0.23388327630008832, + "loss": 0.0763, + "num_input_tokens_seen": 11258368, + "step": 12445 + }, + { + "epoch": 3.285865118120628, + "grad_norm": 0.0033614772837609053, + "learning_rate": 0.23383443663922443, + "loss": 0.0956, + "num_input_tokens_seen": 11262816, + "step": 12450 + }, + { + "epoch": 3.287184901676125, + "grad_norm": 0.001160189975053072, + "learning_rate": 0.23378558405003685, + "loss": 0.1014, + "num_input_tokens_seen": 11267232, + "step": 12455 + }, + { + "epoch": 3.288504685231622, + "grad_norm": 0.0026891210582107306, + "learning_rate": 0.2337367185400593, + "loss": 0.0939, + "num_input_tokens_seen": 11271776, + "step": 12460 + }, + { + "epoch": 3.2898244687871188, + "grad_norm": 0.0036675792653113604, + "learning_rate": 0.23368784011682747, + "loss": 0.102, + "num_input_tokens_seen": 11276160, + "step": 12465 + }, + { + "epoch": 3.2911442523426158, + "grad_norm": 0.003397448919713497, + "learning_rate": 0.23363894878787902, + "loss": 0.1203, + "num_input_tokens_seen": 11280608, + "step": 12470 + }, + { + "epoch": 3.2924640358981128, + "grad_norm": 0.0020195464603602886, + "learning_rate": 0.23359004456075352, + "loss": 0.0909, + "num_input_tokens_seen": 11285024, + "step": 12475 + }, + { + "epoch": 3.2937838194536098, + "grad_norm": 0.0021917864214628935, + "learning_rate": 0.23354112744299277, + "loss": 0.1296, + "num_input_tokens_seen": 11289920, + "step": 12480 + }, + { + "epoch": 3.2951036030091063, + "grad_norm": 0.0033845724537968636, + "learning_rate": 0.2334921974421403, + "loss": 0.1149, + "num_input_tokens_seen": 11294176, + "step": 12485 + }, + { + "epoch": 3.2964233865646033, + "grad_norm": 0.002031842013821006, + "learning_rate": 0.23344325456574178, + "loss": 0.1264, + "num_input_tokens_seen": 11298656, + "step": 12490 + }, + { + "epoch": 3.2977431701201003, + "grad_norm": 0.0016581905074417591, + "learning_rate": 0.23339429882134477, + "loss": 0.0898, + "num_input_tokens_seen": 11303072, + "step": 12495 + }, + { + "epoch": 3.2990629536755973, + "grad_norm": 0.0010022070491686463, + "learning_rate": 0.23334533021649884, + "loss": 0.0855, + "num_input_tokens_seen": 11307616, + "step": 12500 + }, + { + "epoch": 3.300382737231094, + "grad_norm": 0.0017200313741341233, + "learning_rate": 0.23329634875875566, + "loss": 0.1075, + "num_input_tokens_seen": 11312416, + "step": 12505 + }, + { + "epoch": 3.301702520786591, + "grad_norm": 0.0025349401403218508, + "learning_rate": 0.23324735445566874, + "loss": 0.0908, + "num_input_tokens_seen": 11316704, + "step": 12510 + }, + { + "epoch": 3.303022304342088, + "grad_norm": 0.001962512033060193, + "learning_rate": 0.2331983473147936, + "loss": 0.1161, + "num_input_tokens_seen": 11321152, + "step": 12515 + }, + { + "epoch": 3.304342087897585, + "grad_norm": 0.0016308831982314587, + "learning_rate": 0.23314932734368776, + "loss": 0.1023, + "num_input_tokens_seen": 11325632, + "step": 12520 + }, + { + "epoch": 3.305661871453082, + "grad_norm": 0.0026661844458431005, + "learning_rate": 0.2331002945499107, + "loss": 0.1088, + "num_input_tokens_seen": 11330144, + "step": 12525 + }, + { + "epoch": 3.3069816550085784, + "grad_norm": 0.0011866699205711484, + "learning_rate": 0.23305124894102397, + "loss": 0.1017, + "num_input_tokens_seen": 11334848, + "step": 12530 + }, + { + "epoch": 3.3083014385640754, + "grad_norm": 0.0011533258948475122, + "learning_rate": 0.23300219052459092, + "loss": 0.0797, + "num_input_tokens_seen": 11339488, + "step": 12535 + }, + { + "epoch": 3.3096212221195724, + "grad_norm": 0.0018138957675546408, + "learning_rate": 0.23295311930817708, + "loss": 0.1161, + "num_input_tokens_seen": 11344160, + "step": 12540 + }, + { + "epoch": 3.3109410056750694, + "grad_norm": 0.0014353971928358078, + "learning_rate": 0.23290403529934972, + "loss": 0.0882, + "num_input_tokens_seen": 11348960, + "step": 12545 + }, + { + "epoch": 3.312260789230566, + "grad_norm": 0.002793873893097043, + "learning_rate": 0.23285493850567832, + "loss": 0.1285, + "num_input_tokens_seen": 11353408, + "step": 12550 + }, + { + "epoch": 3.313580572786063, + "grad_norm": 0.0008117372053675354, + "learning_rate": 0.23280582893473414, + "loss": 0.0733, + "num_input_tokens_seen": 11357824, + "step": 12555 + }, + { + "epoch": 3.31490035634156, + "grad_norm": 0.0014934036880731583, + "learning_rate": 0.2327567065940906, + "loss": 0.0857, + "num_input_tokens_seen": 11362528, + "step": 12560 + }, + { + "epoch": 3.316220139897057, + "grad_norm": 0.0012476970441639423, + "learning_rate": 0.23270757149132285, + "loss": 0.0739, + "num_input_tokens_seen": 11367232, + "step": 12565 + }, + { + "epoch": 3.3175399234525536, + "grad_norm": 0.0021237435285001993, + "learning_rate": 0.23265842363400827, + "loss": 0.0961, + "num_input_tokens_seen": 11371808, + "step": 12570 + }, + { + "epoch": 3.3188597070080506, + "grad_norm": 0.0007412621635012329, + "learning_rate": 0.23260926302972595, + "loss": 0.0502, + "num_input_tokens_seen": 11376192, + "step": 12575 + }, + { + "epoch": 3.3201794905635476, + "grad_norm": 0.001226572087034583, + "learning_rate": 0.2325600896860572, + "loss": 0.1223, + "num_input_tokens_seen": 11381088, + "step": 12580 + }, + { + "epoch": 3.3214992741190446, + "grad_norm": 0.003977896645665169, + "learning_rate": 0.23251090361058505, + "loss": 0.0768, + "num_input_tokens_seen": 11385856, + "step": 12585 + }, + { + "epoch": 3.3228190576745416, + "grad_norm": 0.0011014818446710706, + "learning_rate": 0.23246170481089476, + "loss": 0.0595, + "num_input_tokens_seen": 11390336, + "step": 12590 + }, + { + "epoch": 3.324138841230038, + "grad_norm": 0.0005413586623035371, + "learning_rate": 0.23241249329457317, + "loss": 0.0823, + "num_input_tokens_seen": 11395136, + "step": 12595 + }, + { + "epoch": 3.325458624785535, + "grad_norm": 0.002888278104364872, + "learning_rate": 0.23236326906920957, + "loss": 0.1134, + "num_input_tokens_seen": 11400032, + "step": 12600 + }, + { + "epoch": 3.325458624785535, + "eval_loss": 0.10288105905056, + "eval_runtime": 75.8116, + "eval_samples_per_second": 88.839, + "eval_steps_per_second": 22.213, + "num_input_tokens_seen": 11400032, + "step": 12600 + }, + { + "epoch": 3.326778408341032, + "grad_norm": 0.0018298582872375846, + "learning_rate": 0.2323140321423948, + "loss": 0.1262, + "num_input_tokens_seen": 11404608, + "step": 12605 + }, + { + "epoch": 3.328098191896529, + "grad_norm": 0.0015952156390994787, + "learning_rate": 0.23226478252172184, + "loss": 0.1013, + "num_input_tokens_seen": 11409120, + "step": 12610 + }, + { + "epoch": 3.329417975452026, + "grad_norm": 0.0006315935752354562, + "learning_rate": 0.23221552021478561, + "loss": 0.0892, + "num_input_tokens_seen": 11413728, + "step": 12615 + }, + { + "epoch": 3.3307377590075227, + "grad_norm": 0.002832988975569606, + "learning_rate": 0.232166245229183, + "loss": 0.1003, + "num_input_tokens_seen": 11418016, + "step": 12620 + }, + { + "epoch": 3.3320575425630197, + "grad_norm": 0.0023411796428263187, + "learning_rate": 0.2321169575725128, + "loss": 0.1332, + "num_input_tokens_seen": 11422560, + "step": 12625 + }, + { + "epoch": 3.3333773261185167, + "grad_norm": 0.002839853521436453, + "learning_rate": 0.23206765725237577, + "loss": 0.1003, + "num_input_tokens_seen": 11426976, + "step": 12630 + }, + { + "epoch": 3.3346971096740132, + "grad_norm": 0.002666645450517535, + "learning_rate": 0.2320183442763747, + "loss": 0.1324, + "num_input_tokens_seen": 11431360, + "step": 12635 + }, + { + "epoch": 3.3360168932295102, + "grad_norm": 0.001502795610576868, + "learning_rate": 0.23196901865211422, + "loss": 0.0501, + "num_input_tokens_seen": 11436384, + "step": 12640 + }, + { + "epoch": 3.3373366767850072, + "grad_norm": 0.0013419792521744967, + "learning_rate": 0.231919680387201, + "loss": 0.0688, + "num_input_tokens_seen": 11440608, + "step": 12645 + }, + { + "epoch": 3.3386564603405042, + "grad_norm": 0.0010302553419023752, + "learning_rate": 0.23187032948924358, + "loss": 0.0608, + "num_input_tokens_seen": 11445216, + "step": 12650 + }, + { + "epoch": 3.3399762438960012, + "grad_norm": 0.002064707223325968, + "learning_rate": 0.23182096596585247, + "loss": 0.0879, + "num_input_tokens_seen": 11449792, + "step": 12655 + }, + { + "epoch": 3.341296027451498, + "grad_norm": 0.0016727441688999534, + "learning_rate": 0.23177158982464025, + "loss": 0.0835, + "num_input_tokens_seen": 11454016, + "step": 12660 + }, + { + "epoch": 3.342615811006995, + "grad_norm": 0.002541447291150689, + "learning_rate": 0.23172220107322122, + "loss": 0.0775, + "num_input_tokens_seen": 11458368, + "step": 12665 + }, + { + "epoch": 3.343935594562492, + "grad_norm": 0.0037889296654611826, + "learning_rate": 0.23167279971921184, + "loss": 0.1224, + "num_input_tokens_seen": 11462624, + "step": 12670 + }, + { + "epoch": 3.345255378117989, + "grad_norm": 0.00231817364692688, + "learning_rate": 0.23162338577023034, + "loss": 0.1365, + "num_input_tokens_seen": 11467072, + "step": 12675 + }, + { + "epoch": 3.346575161673486, + "grad_norm": 0.0023703176993876696, + "learning_rate": 0.23157395923389704, + "loss": 0.0579, + "num_input_tokens_seen": 11471232, + "step": 12680 + }, + { + "epoch": 3.3478949452289823, + "grad_norm": 0.0016568932915106416, + "learning_rate": 0.2315245201178341, + "loss": 0.1134, + "num_input_tokens_seen": 11475808, + "step": 12685 + }, + { + "epoch": 3.3492147287844793, + "grad_norm": 0.0035965510178357363, + "learning_rate": 0.23147506842966564, + "loss": 0.0929, + "num_input_tokens_seen": 11480544, + "step": 12690 + }, + { + "epoch": 3.3505345123399763, + "grad_norm": 0.002644178457558155, + "learning_rate": 0.23142560417701774, + "loss": 0.0662, + "num_input_tokens_seen": 11485216, + "step": 12695 + }, + { + "epoch": 3.351854295895473, + "grad_norm": 0.002699841046705842, + "learning_rate": 0.23137612736751845, + "loss": 0.0941, + "num_input_tokens_seen": 11489504, + "step": 12700 + }, + { + "epoch": 3.35317407945097, + "grad_norm": 0.002752825850620866, + "learning_rate": 0.23132663800879766, + "loss": 0.067, + "num_input_tokens_seen": 11494240, + "step": 12705 + }, + { + "epoch": 3.354493863006467, + "grad_norm": 0.002719649113714695, + "learning_rate": 0.2312771361084873, + "loss": 0.0796, + "num_input_tokens_seen": 11499072, + "step": 12710 + }, + { + "epoch": 3.355813646561964, + "grad_norm": 0.0036576385609805584, + "learning_rate": 0.23122762167422112, + "loss": 0.0622, + "num_input_tokens_seen": 11503648, + "step": 12715 + }, + { + "epoch": 3.357133430117461, + "grad_norm": 0.0028093273285776377, + "learning_rate": 0.23117809471363493, + "loss": 0.0897, + "num_input_tokens_seen": 11508256, + "step": 12720 + }, + { + "epoch": 3.3584532136729575, + "grad_norm": 0.003177459817379713, + "learning_rate": 0.23112855523436637, + "loss": 0.0979, + "num_input_tokens_seen": 11513088, + "step": 12725 + }, + { + "epoch": 3.3597729972284545, + "grad_norm": 0.0013175331987440586, + "learning_rate": 0.23107900324405511, + "loss": 0.0937, + "num_input_tokens_seen": 11517600, + "step": 12730 + }, + { + "epoch": 3.3610927807839515, + "grad_norm": 0.0022665313445031643, + "learning_rate": 0.2310294387503426, + "loss": 0.08, + "num_input_tokens_seen": 11521856, + "step": 12735 + }, + { + "epoch": 3.3624125643394485, + "grad_norm": 0.0015262032393366098, + "learning_rate": 0.23097986176087237, + "loss": 0.0677, + "num_input_tokens_seen": 11526304, + "step": 12740 + }, + { + "epoch": 3.3637323478949455, + "grad_norm": 0.002429271349683404, + "learning_rate": 0.23093027228328986, + "loss": 0.0846, + "num_input_tokens_seen": 11531008, + "step": 12745 + }, + { + "epoch": 3.365052131450442, + "grad_norm": 0.002482820302248001, + "learning_rate": 0.23088067032524226, + "loss": 0.1269, + "num_input_tokens_seen": 11535872, + "step": 12750 + }, + { + "epoch": 3.366371915005939, + "grad_norm": 0.0030254630837589502, + "learning_rate": 0.23083105589437888, + "loss": 0.1144, + "num_input_tokens_seen": 11540448, + "step": 12755 + }, + { + "epoch": 3.367691698561436, + "grad_norm": 0.0022215491626411676, + "learning_rate": 0.23078142899835094, + "loss": 0.087, + "num_input_tokens_seen": 11544960, + "step": 12760 + }, + { + "epoch": 3.3690114821169326, + "grad_norm": 0.0018884026212617755, + "learning_rate": 0.23073178964481147, + "loss": 0.0841, + "num_input_tokens_seen": 11549728, + "step": 12765 + }, + { + "epoch": 3.3703312656724296, + "grad_norm": 0.002852835925295949, + "learning_rate": 0.2306821378414155, + "loss": 0.1024, + "num_input_tokens_seen": 11554048, + "step": 12770 + }, + { + "epoch": 3.3716510492279266, + "grad_norm": 0.0021201244089752436, + "learning_rate": 0.2306324735958199, + "loss": 0.0989, + "num_input_tokens_seen": 11558848, + "step": 12775 + }, + { + "epoch": 3.3729708327834236, + "grad_norm": 0.0025728484615683556, + "learning_rate": 0.23058279691568362, + "loss": 0.1187, + "num_input_tokens_seen": 11562976, + "step": 12780 + }, + { + "epoch": 3.3742906163389206, + "grad_norm": 0.0031672969926148653, + "learning_rate": 0.23053310780866745, + "loss": 0.1153, + "num_input_tokens_seen": 11567424, + "step": 12785 + }, + { + "epoch": 3.375610399894417, + "grad_norm": 0.0026716093998402357, + "learning_rate": 0.23048340628243397, + "loss": 0.114, + "num_input_tokens_seen": 11571648, + "step": 12790 + }, + { + "epoch": 3.376930183449914, + "grad_norm": 0.0027578494045883417, + "learning_rate": 0.23043369234464783, + "loss": 0.1031, + "num_input_tokens_seen": 11575904, + "step": 12795 + }, + { + "epoch": 3.378249967005411, + "grad_norm": 0.0036830753087997437, + "learning_rate": 0.2303839660029755, + "loss": 0.1257, + "num_input_tokens_seen": 11580736, + "step": 12800 + }, + { + "epoch": 3.378249967005411, + "eval_loss": 0.11157433688640594, + "eval_runtime": 75.878, + "eval_samples_per_second": 88.761, + "eval_steps_per_second": 22.194, + "num_input_tokens_seen": 11580736, + "step": 12800 + }, + { + "epoch": 3.379569750560908, + "grad_norm": 0.002387978369370103, + "learning_rate": 0.23033422726508548, + "loss": 0.0946, + "num_input_tokens_seen": 11585280, + "step": 12805 + }, + { + "epoch": 3.380889534116405, + "grad_norm": 0.002204088494181633, + "learning_rate": 0.23028447613864808, + "loss": 0.1141, + "num_input_tokens_seen": 11589664, + "step": 12810 + }, + { + "epoch": 3.3822093176719017, + "grad_norm": 0.0018937689019367099, + "learning_rate": 0.2302347126313355, + "loss": 0.1, + "num_input_tokens_seen": 11593952, + "step": 12815 + }, + { + "epoch": 3.3835291012273987, + "grad_norm": 0.0012559279566630721, + "learning_rate": 0.23018493675082197, + "loss": 0.0637, + "num_input_tokens_seen": 11598784, + "step": 12820 + }, + { + "epoch": 3.3848488847828957, + "grad_norm": 0.0027142600156366825, + "learning_rate": 0.2301351485047835, + "loss": 0.0963, + "num_input_tokens_seen": 11603552, + "step": 12825 + }, + { + "epoch": 3.3861686683383927, + "grad_norm": 0.0023666475899517536, + "learning_rate": 0.23008534790089813, + "loss": 0.1214, + "num_input_tokens_seen": 11608320, + "step": 12830 + }, + { + "epoch": 3.3874884518938893, + "grad_norm": 0.0012543154880404472, + "learning_rate": 0.2300355349468457, + "loss": 0.1026, + "num_input_tokens_seen": 11613216, + "step": 12835 + }, + { + "epoch": 3.3888082354493863, + "grad_norm": 0.00318295625038445, + "learning_rate": 0.22998570965030793, + "loss": 0.0816, + "num_input_tokens_seen": 11617792, + "step": 12840 + }, + { + "epoch": 3.3901280190048833, + "grad_norm": 0.0021572464611381292, + "learning_rate": 0.22993587201896862, + "loss": 0.1022, + "num_input_tokens_seen": 11621728, + "step": 12845 + }, + { + "epoch": 3.3914478025603803, + "grad_norm": 0.0022635196801275015, + "learning_rate": 0.2298860220605133, + "loss": 0.0868, + "num_input_tokens_seen": 11626240, + "step": 12850 + }, + { + "epoch": 3.392767586115877, + "grad_norm": 0.001964057795703411, + "learning_rate": 0.22983615978262942, + "loss": 0.1069, + "num_input_tokens_seen": 11630592, + "step": 12855 + }, + { + "epoch": 3.394087369671374, + "grad_norm": 0.002482770010828972, + "learning_rate": 0.22978628519300648, + "loss": 0.1106, + "num_input_tokens_seen": 11635264, + "step": 12860 + }, + { + "epoch": 3.395407153226871, + "grad_norm": 0.001796905999071896, + "learning_rate": 0.22973639829933568, + "loss": 0.0843, + "num_input_tokens_seen": 11639744, + "step": 12865 + }, + { + "epoch": 3.396726936782368, + "grad_norm": 0.00158801453653723, + "learning_rate": 0.22968649910931027, + "loss": 0.0969, + "num_input_tokens_seen": 11644064, + "step": 12870 + }, + { + "epoch": 3.398046720337865, + "grad_norm": 0.0017599144484847784, + "learning_rate": 0.22963658763062528, + "loss": 0.1157, + "num_input_tokens_seen": 11648320, + "step": 12875 + }, + { + "epoch": 3.3993665038933614, + "grad_norm": 0.0024219707120209932, + "learning_rate": 0.22958666387097765, + "loss": 0.0931, + "num_input_tokens_seen": 11653024, + "step": 12880 + }, + { + "epoch": 3.4006862874488584, + "grad_norm": 0.0019045824883505702, + "learning_rate": 0.22953672783806633, + "loss": 0.0813, + "num_input_tokens_seen": 11657440, + "step": 12885 + }, + { + "epoch": 3.4020060710043554, + "grad_norm": 0.00042148225475102663, + "learning_rate": 0.22948677953959207, + "loss": 0.0756, + "num_input_tokens_seen": 11662272, + "step": 12890 + }, + { + "epoch": 3.4033258545598524, + "grad_norm": 0.002108012093231082, + "learning_rate": 0.2294368189832575, + "loss": 0.0942, + "num_input_tokens_seen": 11667072, + "step": 12895 + }, + { + "epoch": 3.404645638115349, + "grad_norm": 0.0015500432346016169, + "learning_rate": 0.2293868461767672, + "loss": 0.1021, + "num_input_tokens_seen": 11671840, + "step": 12900 + }, + { + "epoch": 3.405965421670846, + "grad_norm": 0.0019445812795311213, + "learning_rate": 0.22933686112782758, + "loss": 0.0839, + "num_input_tokens_seen": 11676192, + "step": 12905 + }, + { + "epoch": 3.407285205226343, + "grad_norm": 0.0013293831143528223, + "learning_rate": 0.22928686384414698, + "loss": 0.1011, + "num_input_tokens_seen": 11680736, + "step": 12910 + }, + { + "epoch": 3.40860498878184, + "grad_norm": 0.0018014439847320318, + "learning_rate": 0.22923685433343552, + "loss": 0.0868, + "num_input_tokens_seen": 11685184, + "step": 12915 + }, + { + "epoch": 3.4099247723373365, + "grad_norm": 0.0016388142248615623, + "learning_rate": 0.22918683260340542, + "loss": 0.1377, + "num_input_tokens_seen": 11689664, + "step": 12920 + }, + { + "epoch": 3.4112445558928335, + "grad_norm": 0.003320357296615839, + "learning_rate": 0.2291367986617706, + "loss": 0.0696, + "num_input_tokens_seen": 11694496, + "step": 12925 + }, + { + "epoch": 3.4125643394483305, + "grad_norm": 0.004775720182806253, + "learning_rate": 0.22908675251624697, + "loss": 0.1117, + "num_input_tokens_seen": 11699008, + "step": 12930 + }, + { + "epoch": 3.4138841230038275, + "grad_norm": 0.002177212620154023, + "learning_rate": 0.22903669417455216, + "loss": 0.0662, + "num_input_tokens_seen": 11703360, + "step": 12935 + }, + { + "epoch": 3.4152039065593245, + "grad_norm": 0.0030343174003064632, + "learning_rate": 0.22898662364440592, + "loss": 0.1073, + "num_input_tokens_seen": 11707904, + "step": 12940 + }, + { + "epoch": 3.416523690114821, + "grad_norm": 0.002113307360559702, + "learning_rate": 0.2289365409335297, + "loss": 0.0621, + "num_input_tokens_seen": 11712576, + "step": 12945 + }, + { + "epoch": 3.417843473670318, + "grad_norm": 0.0038674839306622744, + "learning_rate": 0.2288864460496469, + "loss": 0.0982, + "num_input_tokens_seen": 11716832, + "step": 12950 + }, + { + "epoch": 3.419163257225815, + "grad_norm": 0.0030210937839001417, + "learning_rate": 0.22883633900048272, + "loss": 0.0909, + "num_input_tokens_seen": 11721312, + "step": 12955 + }, + { + "epoch": 3.420483040781312, + "grad_norm": 0.002797875553369522, + "learning_rate": 0.2287862197937644, + "loss": 0.1031, + "num_input_tokens_seen": 11725824, + "step": 12960 + }, + { + "epoch": 3.4218028243368086, + "grad_norm": 0.0020659933798015118, + "learning_rate": 0.2287360884372209, + "loss": 0.1044, + "num_input_tokens_seen": 11730368, + "step": 12965 + }, + { + "epoch": 3.4231226078923056, + "grad_norm": 0.001065328367985785, + "learning_rate": 0.22868594493858307, + "loss": 0.0735, + "num_input_tokens_seen": 11734688, + "step": 12970 + }, + { + "epoch": 3.4244423914478026, + "grad_norm": 0.0025667932350188494, + "learning_rate": 0.2286357893055837, + "loss": 0.0831, + "num_input_tokens_seen": 11739264, + "step": 12975 + }, + { + "epoch": 3.4257621750032996, + "grad_norm": 0.0032975163776427507, + "learning_rate": 0.22858562154595746, + "loss": 0.1066, + "num_input_tokens_seen": 11743680, + "step": 12980 + }, + { + "epoch": 3.427081958558796, + "grad_norm": 0.002418824005872011, + "learning_rate": 0.22853544166744078, + "loss": 0.0687, + "num_input_tokens_seen": 11747872, + "step": 12985 + }, + { + "epoch": 3.428401742114293, + "grad_norm": 0.0013334631221368909, + "learning_rate": 0.22848524967777206, + "loss": 0.0808, + "num_input_tokens_seen": 11752256, + "step": 12990 + }, + { + "epoch": 3.42972152566979, + "grad_norm": 0.0022142501547932625, + "learning_rate": 0.22843504558469152, + "loss": 0.0999, + "num_input_tokens_seen": 11756800, + "step": 12995 + }, + { + "epoch": 3.431041309225287, + "grad_norm": 0.0032696931157261133, + "learning_rate": 0.2283848293959413, + "loss": 0.0788, + "num_input_tokens_seen": 11761344, + "step": 13000 + }, + { + "epoch": 3.431041309225287, + "eval_loss": 0.10784798860549927, + "eval_runtime": 75.8003, + "eval_samples_per_second": 88.852, + "eval_steps_per_second": 22.216, + "num_input_tokens_seen": 11761344, + "step": 13000 + }, + { + "epoch": 3.432361092780784, + "grad_norm": 0.0025640607345849276, + "learning_rate": 0.22833460111926532, + "loss": 0.0881, + "num_input_tokens_seen": 11765632, + "step": 13005 + }, + { + "epoch": 3.4336808763362807, + "grad_norm": 0.0024766346905380487, + "learning_rate": 0.22828436076240946, + "loss": 0.0556, + "num_input_tokens_seen": 11769984, + "step": 13010 + }, + { + "epoch": 3.4350006598917777, + "grad_norm": 0.0015209575649350882, + "learning_rate": 0.22823410833312135, + "loss": 0.0436, + "num_input_tokens_seen": 11774400, + "step": 13015 + }, + { + "epoch": 3.4363204434472747, + "grad_norm": 0.003945228643715382, + "learning_rate": 0.2281838438391506, + "loss": 0.0942, + "num_input_tokens_seen": 11778912, + "step": 13020 + }, + { + "epoch": 3.4376402270027717, + "grad_norm": 0.0033741916995495558, + "learning_rate": 0.22813356728824863, + "loss": 0.0731, + "num_input_tokens_seen": 11783520, + "step": 13025 + }, + { + "epoch": 3.4389600105582683, + "grad_norm": 0.003221143502742052, + "learning_rate": 0.2280832786881687, + "loss": 0.1316, + "num_input_tokens_seen": 11788288, + "step": 13030 + }, + { + "epoch": 3.4402797941137653, + "grad_norm": 0.0031446588691323996, + "learning_rate": 0.22803297804666592, + "loss": 0.109, + "num_input_tokens_seen": 11792832, + "step": 13035 + }, + { + "epoch": 3.4415995776692623, + "grad_norm": 0.0016039996407926083, + "learning_rate": 0.22798266537149728, + "loss": 0.0704, + "num_input_tokens_seen": 11797440, + "step": 13040 + }, + { + "epoch": 3.4429193612247593, + "grad_norm": 0.002044868655502796, + "learning_rate": 0.22793234067042167, + "loss": 0.1005, + "num_input_tokens_seen": 11802144, + "step": 13045 + }, + { + "epoch": 3.444239144780256, + "grad_norm": 0.0014217199059203267, + "learning_rate": 0.22788200395119979, + "loss": 0.0851, + "num_input_tokens_seen": 11806528, + "step": 13050 + }, + { + "epoch": 3.445558928335753, + "grad_norm": 0.0031440812163054943, + "learning_rate": 0.2278316552215942, + "loss": 0.1073, + "num_input_tokens_seen": 11810880, + "step": 13055 + }, + { + "epoch": 3.44687871189125, + "grad_norm": 0.0018242362421005964, + "learning_rate": 0.22778129448936918, + "loss": 0.0704, + "num_input_tokens_seen": 11815424, + "step": 13060 + }, + { + "epoch": 3.448198495446747, + "grad_norm": 0.003504617605358362, + "learning_rate": 0.22773092176229118, + "loss": 0.145, + "num_input_tokens_seen": 11819904, + "step": 13065 + }, + { + "epoch": 3.449518279002244, + "grad_norm": 0.002408362226560712, + "learning_rate": 0.22768053704812816, + "loss": 0.1662, + "num_input_tokens_seen": 11824288, + "step": 13070 + }, + { + "epoch": 3.4508380625577404, + "grad_norm": 0.0005713790305890143, + "learning_rate": 0.22763014035465018, + "loss": 0.0784, + "num_input_tokens_seen": 11828768, + "step": 13075 + }, + { + "epoch": 3.4521578461132374, + "grad_norm": 0.0024742211680859327, + "learning_rate": 0.22757973168962892, + "loss": 0.0925, + "num_input_tokens_seen": 11833024, + "step": 13080 + }, + { + "epoch": 3.4534776296687344, + "grad_norm": 0.0011681696632876992, + "learning_rate": 0.22752931106083818, + "loss": 0.0762, + "num_input_tokens_seen": 11837696, + "step": 13085 + }, + { + "epoch": 3.4547974132242314, + "grad_norm": 0.002284741261973977, + "learning_rate": 0.22747887847605341, + "loss": 0.0839, + "num_input_tokens_seen": 11841984, + "step": 13090 + }, + { + "epoch": 3.456117196779728, + "grad_norm": 0.00449763098731637, + "learning_rate": 0.22742843394305184, + "loss": 0.0799, + "num_input_tokens_seen": 11846144, + "step": 13095 + }, + { + "epoch": 3.457436980335225, + "grad_norm": 0.0021212887950241566, + "learning_rate": 0.22737797746961272, + "loss": 0.0661, + "num_input_tokens_seen": 11850688, + "step": 13100 + }, + { + "epoch": 3.458756763890722, + "grad_norm": 0.002082928316667676, + "learning_rate": 0.22732750906351712, + "loss": 0.0664, + "num_input_tokens_seen": 11855232, + "step": 13105 + }, + { + "epoch": 3.460076547446219, + "grad_norm": 0.001628246740438044, + "learning_rate": 0.22727702873254785, + "loss": 0.0988, + "num_input_tokens_seen": 11859712, + "step": 13110 + }, + { + "epoch": 3.4613963310017155, + "grad_norm": 0.0003870512009598315, + "learning_rate": 0.22722653648448968, + "loss": 0.0874, + "num_input_tokens_seen": 11864288, + "step": 13115 + }, + { + "epoch": 3.4627161145572125, + "grad_norm": 0.002066558226943016, + "learning_rate": 0.22717603232712902, + "loss": 0.0671, + "num_input_tokens_seen": 11868512, + "step": 13120 + }, + { + "epoch": 3.4640358981127095, + "grad_norm": 0.0019448068924248219, + "learning_rate": 0.22712551626825436, + "loss": 0.0652, + "num_input_tokens_seen": 11873056, + "step": 13125 + }, + { + "epoch": 3.4653556816682065, + "grad_norm": 0.0024174905847758055, + "learning_rate": 0.2270749883156559, + "loss": 0.1164, + "num_input_tokens_seen": 11877696, + "step": 13130 + }, + { + "epoch": 3.4666754652237035, + "grad_norm": 0.0036634616553783417, + "learning_rate": 0.22702444847712563, + "loss": 0.0709, + "num_input_tokens_seen": 11882176, + "step": 13135 + }, + { + "epoch": 3.4679952487792, + "grad_norm": 0.002207121578976512, + "learning_rate": 0.22697389676045743, + "loss": 0.0952, + "num_input_tokens_seen": 11886560, + "step": 13140 + }, + { + "epoch": 3.469315032334697, + "grad_norm": 0.0037766173481941223, + "learning_rate": 0.22692333317344704, + "loss": 0.0812, + "num_input_tokens_seen": 11891168, + "step": 13145 + }, + { + "epoch": 3.470634815890194, + "grad_norm": 0.0035979740787297487, + "learning_rate": 0.22687275772389198, + "loss": 0.0896, + "num_input_tokens_seen": 11895744, + "step": 13150 + }, + { + "epoch": 3.471954599445691, + "grad_norm": 0.0011578600388020277, + "learning_rate": 0.22682217041959168, + "loss": 0.0608, + "num_input_tokens_seen": 11900032, + "step": 13155 + }, + { + "epoch": 3.4732743830011876, + "grad_norm": 0.0016292002983391285, + "learning_rate": 0.2267715712683473, + "loss": 0.0787, + "num_input_tokens_seen": 11904480, + "step": 13160 + }, + { + "epoch": 3.4745941665566846, + "grad_norm": 0.0010407086228951812, + "learning_rate": 0.22672096027796182, + "loss": 0.0402, + "num_input_tokens_seen": 11908704, + "step": 13165 + }, + { + "epoch": 3.4759139501121816, + "grad_norm": 0.0019811256788671017, + "learning_rate": 0.22667033745624016, + "loss": 0.1192, + "num_input_tokens_seen": 11913344, + "step": 13170 + }, + { + "epoch": 3.4772337336676786, + "grad_norm": 0.002843362046405673, + "learning_rate": 0.22661970281098895, + "loss": 0.1299, + "num_input_tokens_seen": 11917760, + "step": 13175 + }, + { + "epoch": 3.478553517223175, + "grad_norm": 0.0023837226908653975, + "learning_rate": 0.22656905635001667, + "loss": 0.087, + "num_input_tokens_seen": 11922336, + "step": 13180 + }, + { + "epoch": 3.479873300778672, + "grad_norm": 0.004258858039975166, + "learning_rate": 0.2265183980811337, + "loss": 0.1085, + "num_input_tokens_seen": 11926912, + "step": 13185 + }, + { + "epoch": 3.481193084334169, + "grad_norm": 0.00182383565697819, + "learning_rate": 0.22646772801215218, + "loss": 0.0776, + "num_input_tokens_seen": 11931680, + "step": 13190 + }, + { + "epoch": 3.482512867889666, + "grad_norm": 0.0011181936133652925, + "learning_rate": 0.22641704615088598, + "loss": 0.0649, + "num_input_tokens_seen": 11936288, + "step": 13195 + }, + { + "epoch": 3.483832651445163, + "grad_norm": 0.0019333136733621359, + "learning_rate": 0.22636635250515103, + "loss": 0.0954, + "num_input_tokens_seen": 11940800, + "step": 13200 + }, + { + "epoch": 3.483832651445163, + "eval_loss": 0.10083501040935516, + "eval_runtime": 75.9825, + "eval_samples_per_second": 88.639, + "eval_steps_per_second": 22.163, + "num_input_tokens_seen": 11940800, + "step": 13200 + }, + { + "epoch": 3.4851524350006597, + "grad_norm": 0.002804814139381051, + "learning_rate": 0.2263156470827648, + "loss": 0.1145, + "num_input_tokens_seen": 11945248, + "step": 13205 + }, + { + "epoch": 3.4864722185561567, + "grad_norm": 0.002997297327965498, + "learning_rate": 0.22626492989154678, + "loss": 0.1061, + "num_input_tokens_seen": 11949984, + "step": 13210 + }, + { + "epoch": 3.4877920021116537, + "grad_norm": 0.0011632846435531974, + "learning_rate": 0.22621420093931813, + "loss": 0.0756, + "num_input_tokens_seen": 11954208, + "step": 13215 + }, + { + "epoch": 3.4891117856671507, + "grad_norm": 0.0039224084466695786, + "learning_rate": 0.22616346023390194, + "loss": 0.0942, + "num_input_tokens_seen": 11959008, + "step": 13220 + }, + { + "epoch": 3.4904315692226473, + "grad_norm": 0.00254047941416502, + "learning_rate": 0.22611270778312306, + "loss": 0.1034, + "num_input_tokens_seen": 11963360, + "step": 13225 + }, + { + "epoch": 3.4917513527781443, + "grad_norm": 0.0021142729092389345, + "learning_rate": 0.2260619435948081, + "loss": 0.0689, + "num_input_tokens_seen": 11967808, + "step": 13230 + }, + { + "epoch": 3.4930711363336413, + "grad_norm": 0.0026637210976332426, + "learning_rate": 0.22601116767678567, + "loss": 0.1225, + "num_input_tokens_seen": 11972320, + "step": 13235 + }, + { + "epoch": 3.4943909198891383, + "grad_norm": 0.0038685593754053116, + "learning_rate": 0.2259603800368859, + "loss": 0.0661, + "num_input_tokens_seen": 11976640, + "step": 13240 + }, + { + "epoch": 3.495710703444635, + "grad_norm": 0.0014911028556525707, + "learning_rate": 0.22590958068294098, + "loss": 0.1031, + "num_input_tokens_seen": 11981056, + "step": 13245 + }, + { + "epoch": 3.497030487000132, + "grad_norm": 0.0012050126679241657, + "learning_rate": 0.22585876962278478, + "loss": 0.0679, + "num_input_tokens_seen": 11985632, + "step": 13250 + }, + { + "epoch": 3.498350270555629, + "grad_norm": 0.001313688000664115, + "learning_rate": 0.22580794686425298, + "loss": 0.0645, + "num_input_tokens_seen": 11990240, + "step": 13255 + }, + { + "epoch": 3.499670054111126, + "grad_norm": 0.0022880220785737038, + "learning_rate": 0.22575711241518312, + "loss": 0.117, + "num_input_tokens_seen": 11995136, + "step": 13260 + }, + { + "epoch": 3.500989837666623, + "grad_norm": 0.0025322220753878355, + "learning_rate": 0.22570626628341453, + "loss": 0.0831, + "num_input_tokens_seen": 11999776, + "step": 13265 + }, + { + "epoch": 3.5023096212221194, + "grad_norm": 0.001707068644464016, + "learning_rate": 0.22565540847678828, + "loss": 0.082, + "num_input_tokens_seen": 12004256, + "step": 13270 + }, + { + "epoch": 3.5036294047776164, + "grad_norm": 0.004083871375769377, + "learning_rate": 0.2256045390031473, + "loss": 0.0929, + "num_input_tokens_seen": 12008448, + "step": 13275 + }, + { + "epoch": 3.5049491883331134, + "grad_norm": 0.0018466482870280743, + "learning_rate": 0.22555365787033627, + "loss": 0.0719, + "num_input_tokens_seen": 12012608, + "step": 13280 + }, + { + "epoch": 3.5062689718886104, + "grad_norm": 0.0025020947214215994, + "learning_rate": 0.22550276508620173, + "loss": 0.1144, + "num_input_tokens_seen": 12017152, + "step": 13285 + }, + { + "epoch": 3.5075887554441074, + "grad_norm": 0.002394933020696044, + "learning_rate": 0.22545186065859202, + "loss": 0.111, + "num_input_tokens_seen": 12021696, + "step": 13290 + }, + { + "epoch": 3.508908538999604, + "grad_norm": 0.0020820919889956713, + "learning_rate": 0.2254009445953572, + "loss": 0.0972, + "num_input_tokens_seen": 12026112, + "step": 13295 + }, + { + "epoch": 3.510228322555101, + "grad_norm": 0.0030637041199952364, + "learning_rate": 0.22535001690434917, + "loss": 0.1082, + "num_input_tokens_seen": 12030432, + "step": 13300 + }, + { + "epoch": 3.511548106110598, + "grad_norm": 0.001196212600916624, + "learning_rate": 0.22529907759342163, + "loss": 0.0954, + "num_input_tokens_seen": 12034752, + "step": 13305 + }, + { + "epoch": 3.5128678896660945, + "grad_norm": 0.001412897021509707, + "learning_rate": 0.22524812667043007, + "loss": 0.067, + "num_input_tokens_seen": 12039424, + "step": 13310 + }, + { + "epoch": 3.5141876732215915, + "grad_norm": 0.0009816689416766167, + "learning_rate": 0.22519716414323177, + "loss": 0.0805, + "num_input_tokens_seen": 12044032, + "step": 13315 + }, + { + "epoch": 3.5155074567770885, + "grad_norm": 0.003305622609332204, + "learning_rate": 0.22514619001968567, + "loss": 0.0951, + "num_input_tokens_seen": 12048736, + "step": 13320 + }, + { + "epoch": 3.5168272403325855, + "grad_norm": 0.0020402774680405855, + "learning_rate": 0.2250952043076528, + "loss": 0.0906, + "num_input_tokens_seen": 12053152, + "step": 13325 + }, + { + "epoch": 3.5181470238880825, + "grad_norm": 0.001485598972067237, + "learning_rate": 0.2250442070149957, + "loss": 0.079, + "num_input_tokens_seen": 12057472, + "step": 13330 + }, + { + "epoch": 3.519466807443579, + "grad_norm": 0.004816355183720589, + "learning_rate": 0.22499319814957885, + "loss": 0.0973, + "num_input_tokens_seen": 12062304, + "step": 13335 + }, + { + "epoch": 3.520786590999076, + "grad_norm": 0.000855968042742461, + "learning_rate": 0.2249421777192684, + "loss": 0.068, + "num_input_tokens_seen": 12067008, + "step": 13340 + }, + { + "epoch": 3.522106374554573, + "grad_norm": 0.00270860455930233, + "learning_rate": 0.22489114573193236, + "loss": 0.1221, + "num_input_tokens_seen": 12071104, + "step": 13345 + }, + { + "epoch": 3.52342615811007, + "grad_norm": 0.0021999995224177837, + "learning_rate": 0.2248401021954405, + "loss": 0.1076, + "num_input_tokens_seen": 12075520, + "step": 13350 + }, + { + "epoch": 3.524745941665567, + "grad_norm": 0.0013666246086359024, + "learning_rate": 0.22478904711766443, + "loss": 0.0819, + "num_input_tokens_seen": 12080064, + "step": 13355 + }, + { + "epoch": 3.5260657252210637, + "grad_norm": 0.0011979510309174657, + "learning_rate": 0.22473798050647734, + "loss": 0.1117, + "num_input_tokens_seen": 12084832, + "step": 13360 + }, + { + "epoch": 3.5273855087765607, + "grad_norm": 0.002293029800057411, + "learning_rate": 0.22468690236975453, + "loss": 0.1065, + "num_input_tokens_seen": 12089280, + "step": 13365 + }, + { + "epoch": 3.5287052923320577, + "grad_norm": 0.0028510086704045534, + "learning_rate": 0.22463581271537272, + "loss": 0.08, + "num_input_tokens_seen": 12094112, + "step": 13370 + }, + { + "epoch": 3.530025075887554, + "grad_norm": 0.002234297338873148, + "learning_rate": 0.22458471155121076, + "loss": 0.1079, + "num_input_tokens_seen": 12098368, + "step": 13375 + }, + { + "epoch": 3.531344859443051, + "grad_norm": 0.0013895391020923853, + "learning_rate": 0.2245335988851489, + "loss": 0.0778, + "num_input_tokens_seen": 12102784, + "step": 13380 + }, + { + "epoch": 3.532664642998548, + "grad_norm": 0.0014978019753471017, + "learning_rate": 0.2244824747250695, + "loss": 0.0905, + "num_input_tokens_seen": 12107392, + "step": 13385 + }, + { + "epoch": 3.533984426554045, + "grad_norm": 0.0015629493864253163, + "learning_rate": 0.22443133907885646, + "loss": 0.0909, + "num_input_tokens_seen": 12112032, + "step": 13390 + }, + { + "epoch": 3.535304210109542, + "grad_norm": 0.0008126117172650993, + "learning_rate": 0.22438019195439557, + "loss": 0.0746, + "num_input_tokens_seen": 12116544, + "step": 13395 + }, + { + "epoch": 3.5366239936650388, + "grad_norm": 0.0016100992215797305, + "learning_rate": 0.22432903335957435, + "loss": 0.0898, + "num_input_tokens_seen": 12121216, + "step": 13400 + }, + { + "epoch": 3.5366239936650388, + "eval_loss": 0.10647682100534439, + "eval_runtime": 75.7968, + "eval_samples_per_second": 88.856, + "eval_steps_per_second": 22.217, + "num_input_tokens_seen": 12121216, + "step": 13400 + }, + { + "epoch": 3.5379437772205358, + "grad_norm": 0.002666928106918931, + "learning_rate": 0.22427786330228214, + "loss": 0.081, + "num_input_tokens_seen": 12126048, + "step": 13405 + }, + { + "epoch": 3.5392635607760328, + "grad_norm": 0.002295644488185644, + "learning_rate": 0.22422668179040997, + "loss": 0.0869, + "num_input_tokens_seen": 12130560, + "step": 13410 + }, + { + "epoch": 3.5405833443315298, + "grad_norm": 0.002753976034000516, + "learning_rate": 0.2241754888318507, + "loss": 0.0822, + "num_input_tokens_seen": 12134816, + "step": 13415 + }, + { + "epoch": 3.5419031278870268, + "grad_norm": 0.002262512454763055, + "learning_rate": 0.22412428443449886, + "loss": 0.0767, + "num_input_tokens_seen": 12139392, + "step": 13420 + }, + { + "epoch": 3.5432229114425233, + "grad_norm": 0.0031567171681672335, + "learning_rate": 0.22407306860625087, + "loss": 0.1232, + "num_input_tokens_seen": 12143872, + "step": 13425 + }, + { + "epoch": 3.5445426949980203, + "grad_norm": 0.0019773715175688267, + "learning_rate": 0.22402184135500483, + "loss": 0.0893, + "num_input_tokens_seen": 12148544, + "step": 13430 + }, + { + "epoch": 3.5458624785535173, + "grad_norm": 0.0035381808411329985, + "learning_rate": 0.22397060268866067, + "loss": 0.087, + "num_input_tokens_seen": 12153152, + "step": 13435 + }, + { + "epoch": 3.547182262109014, + "grad_norm": 0.0011709033278748393, + "learning_rate": 0.22391935261511994, + "loss": 0.0701, + "num_input_tokens_seen": 12157408, + "step": 13440 + }, + { + "epoch": 3.548502045664511, + "grad_norm": 0.004556722939014435, + "learning_rate": 0.22386809114228615, + "loss": 0.0688, + "num_input_tokens_seen": 12161792, + "step": 13445 + }, + { + "epoch": 3.549821829220008, + "grad_norm": 0.0008507507154718041, + "learning_rate": 0.22381681827806446, + "loss": 0.1063, + "num_input_tokens_seen": 12166016, + "step": 13450 + }, + { + "epoch": 3.551141612775505, + "grad_norm": 0.002073005074635148, + "learning_rate": 0.22376553403036173, + "loss": 0.0919, + "num_input_tokens_seen": 12170624, + "step": 13455 + }, + { + "epoch": 3.552461396331002, + "grad_norm": 0.0015820745611563325, + "learning_rate": 0.22371423840708662, + "loss": 0.1211, + "num_input_tokens_seen": 12174976, + "step": 13460 + }, + { + "epoch": 3.5537811798864984, + "grad_norm": 0.002960552228614688, + "learning_rate": 0.22366293141614962, + "loss": 0.1143, + "num_input_tokens_seen": 12179232, + "step": 13465 + }, + { + "epoch": 3.5551009634419954, + "grad_norm": 0.0017035522032529116, + "learning_rate": 0.22361161306546287, + "loss": 0.0858, + "num_input_tokens_seen": 12183808, + "step": 13470 + }, + { + "epoch": 3.5564207469974924, + "grad_norm": 0.0006253696628846228, + "learning_rate": 0.22356028336294037, + "loss": 0.0814, + "num_input_tokens_seen": 12188576, + "step": 13475 + }, + { + "epoch": 3.5577405305529894, + "grad_norm": 0.001248989487066865, + "learning_rate": 0.2235089423164977, + "loss": 0.0687, + "num_input_tokens_seen": 12192992, + "step": 13480 + }, + { + "epoch": 3.5590603141084864, + "grad_norm": 0.0034283592831343412, + "learning_rate": 0.22345758993405243, + "loss": 0.1404, + "num_input_tokens_seen": 12197312, + "step": 13485 + }, + { + "epoch": 3.560380097663983, + "grad_norm": 0.0017004150431603193, + "learning_rate": 0.2234062262235236, + "loss": 0.1039, + "num_input_tokens_seen": 12201856, + "step": 13490 + }, + { + "epoch": 3.56169988121948, + "grad_norm": 0.0017208356875926256, + "learning_rate": 0.22335485119283222, + "loss": 0.0952, + "num_input_tokens_seen": 12206560, + "step": 13495 + }, + { + "epoch": 3.563019664774977, + "grad_norm": 0.0029007326811552048, + "learning_rate": 0.22330346484990093, + "loss": 0.0983, + "num_input_tokens_seen": 12211136, + "step": 13500 + }, + { + "epoch": 3.5643394483304736, + "grad_norm": 0.0014950040495023131, + "learning_rate": 0.22325206720265425, + "loss": 0.129, + "num_input_tokens_seen": 12216032, + "step": 13505 + }, + { + "epoch": 3.5656592318859706, + "grad_norm": 0.0016484420048072934, + "learning_rate": 0.2232006582590182, + "loss": 0.0672, + "num_input_tokens_seen": 12220544, + "step": 13510 + }, + { + "epoch": 3.5669790154414676, + "grad_norm": 0.0011062604608014226, + "learning_rate": 0.22314923802692077, + "loss": 0.0546, + "num_input_tokens_seen": 12225216, + "step": 13515 + }, + { + "epoch": 3.5682987989969646, + "grad_norm": 0.002459988696500659, + "learning_rate": 0.22309780651429156, + "loss": 0.0929, + "num_input_tokens_seen": 12229408, + "step": 13520 + }, + { + "epoch": 3.5696185825524616, + "grad_norm": 0.003024352015927434, + "learning_rate": 0.22304636372906203, + "loss": 0.0954, + "num_input_tokens_seen": 12234240, + "step": 13525 + }, + { + "epoch": 3.570938366107958, + "grad_norm": 0.002660153666511178, + "learning_rate": 0.22299490967916522, + "loss": 0.078, + "num_input_tokens_seen": 12238848, + "step": 13530 + }, + { + "epoch": 3.572258149663455, + "grad_norm": 0.0016872932901605964, + "learning_rate": 0.22294344437253602, + "loss": 0.093, + "num_input_tokens_seen": 12243328, + "step": 13535 + }, + { + "epoch": 3.573577933218952, + "grad_norm": 0.0030493405647575855, + "learning_rate": 0.22289196781711101, + "loss": 0.0862, + "num_input_tokens_seen": 12247904, + "step": 13540 + }, + { + "epoch": 3.574897716774449, + "grad_norm": 0.0029718182049691677, + "learning_rate": 0.2228404800208286, + "loss": 0.0892, + "num_input_tokens_seen": 12252576, + "step": 13545 + }, + { + "epoch": 3.576217500329946, + "grad_norm": 0.0021064402535557747, + "learning_rate": 0.22278898099162875, + "loss": 0.0841, + "num_input_tokens_seen": 12257440, + "step": 13550 + }, + { + "epoch": 3.5775372838854427, + "grad_norm": 0.002108602551743388, + "learning_rate": 0.22273747073745337, + "loss": 0.0661, + "num_input_tokens_seen": 12261920, + "step": 13555 + }, + { + "epoch": 3.5788570674409397, + "grad_norm": 0.0030991612002253532, + "learning_rate": 0.22268594926624588, + "loss": 0.1279, + "num_input_tokens_seen": 12266528, + "step": 13560 + }, + { + "epoch": 3.5801768509964367, + "grad_norm": 0.0014665157068520784, + "learning_rate": 0.22263441658595162, + "loss": 0.1306, + "num_input_tokens_seen": 12271168, + "step": 13565 + }, + { + "epoch": 3.5814966345519332, + "grad_norm": 0.0018675988540053368, + "learning_rate": 0.2225828727045175, + "loss": 0.1173, + "num_input_tokens_seen": 12275424, + "step": 13570 + }, + { + "epoch": 3.5828164181074302, + "grad_norm": 0.002483902731910348, + "learning_rate": 0.22253131762989228, + "loss": 0.088, + "num_input_tokens_seen": 12279904, + "step": 13575 + }, + { + "epoch": 3.5841362016629272, + "grad_norm": 0.0023345500230789185, + "learning_rate": 0.2224797513700264, + "loss": 0.0773, + "num_input_tokens_seen": 12284384, + "step": 13580 + }, + { + "epoch": 3.5854559852184242, + "grad_norm": 0.0014216945273801684, + "learning_rate": 0.22242817393287204, + "loss": 0.0752, + "num_input_tokens_seen": 12288736, + "step": 13585 + }, + { + "epoch": 3.5867757687739212, + "grad_norm": 0.001681089517660439, + "learning_rate": 0.22237658532638305, + "loss": 0.114, + "num_input_tokens_seen": 12293088, + "step": 13590 + }, + { + "epoch": 3.588095552329418, + "grad_norm": 0.0017862934619188309, + "learning_rate": 0.22232498555851513, + "loss": 0.0954, + "num_input_tokens_seen": 12297344, + "step": 13595 + }, + { + "epoch": 3.589415335884915, + "grad_norm": 0.0020588140469044447, + "learning_rate": 0.22227337463722546, + "loss": 0.0888, + "num_input_tokens_seen": 12302080, + "step": 13600 + }, + { + "epoch": 3.589415335884915, + "eval_loss": 0.11231720447540283, + "eval_runtime": 75.95, + "eval_samples_per_second": 88.677, + "eval_steps_per_second": 22.172, + "num_input_tokens_seen": 12302080, + "step": 13600 + }, + { + "epoch": 3.590735119440412, + "grad_norm": 0.002944505773484707, + "learning_rate": 0.2222217525704732, + "loss": 0.1061, + "num_input_tokens_seen": 12306528, + "step": 13605 + }, + { + "epoch": 3.592054902995909, + "grad_norm": 0.001126070274040103, + "learning_rate": 0.22217011936621908, + "loss": 0.0669, + "num_input_tokens_seen": 12310976, + "step": 13610 + }, + { + "epoch": 3.593374686551406, + "grad_norm": 0.004172799177467823, + "learning_rate": 0.22211847503242566, + "loss": 0.1254, + "num_input_tokens_seen": 12315616, + "step": 13615 + }, + { + "epoch": 3.5946944701069024, + "grad_norm": 0.0023311618715524673, + "learning_rate": 0.22206681957705704, + "loss": 0.0948, + "num_input_tokens_seen": 12320256, + "step": 13620 + }, + { + "epoch": 3.5960142536623994, + "grad_norm": 0.0016310252249240875, + "learning_rate": 0.2220151530080792, + "loss": 0.0729, + "num_input_tokens_seen": 12324288, + "step": 13625 + }, + { + "epoch": 3.5973340372178964, + "grad_norm": 0.002837464911863208, + "learning_rate": 0.2219634753334598, + "loss": 0.1324, + "num_input_tokens_seen": 12328768, + "step": 13630 + }, + { + "epoch": 3.598653820773393, + "grad_norm": 0.0015164921060204506, + "learning_rate": 0.22191178656116817, + "loss": 0.0831, + "num_input_tokens_seen": 12333120, + "step": 13635 + }, + { + "epoch": 3.59997360432889, + "grad_norm": 0.0017010350711643696, + "learning_rate": 0.2218600866991753, + "loss": 0.0737, + "num_input_tokens_seen": 12337408, + "step": 13640 + }, + { + "epoch": 3.601293387884387, + "grad_norm": 0.0016940825153142214, + "learning_rate": 0.221808375755454, + "loss": 0.1349, + "num_input_tokens_seen": 12341888, + "step": 13645 + }, + { + "epoch": 3.602613171439884, + "grad_norm": 0.0013015944277867675, + "learning_rate": 0.22175665373797881, + "loss": 0.0578, + "num_input_tokens_seen": 12346432, + "step": 13650 + }, + { + "epoch": 3.603932954995381, + "grad_norm": 0.0008277000160887837, + "learning_rate": 0.22170492065472583, + "loss": 0.1159, + "num_input_tokens_seen": 12351008, + "step": 13655 + }, + { + "epoch": 3.6052527385508775, + "grad_norm": 0.003119373694062233, + "learning_rate": 0.221653176513673, + "loss": 0.0851, + "num_input_tokens_seen": 12355584, + "step": 13660 + }, + { + "epoch": 3.6065725221063745, + "grad_norm": 0.003198528429493308, + "learning_rate": 0.2216014213227999, + "loss": 0.112, + "num_input_tokens_seen": 12360256, + "step": 13665 + }, + { + "epoch": 3.6078923056618715, + "grad_norm": 0.0023944824934005737, + "learning_rate": 0.22154965509008784, + "loss": 0.1155, + "num_input_tokens_seen": 12364800, + "step": 13670 + }, + { + "epoch": 3.6092120892173685, + "grad_norm": 0.0020730034448206425, + "learning_rate": 0.2214978778235198, + "loss": 0.0655, + "num_input_tokens_seen": 12369408, + "step": 13675 + }, + { + "epoch": 3.6105318727728655, + "grad_norm": 0.0024802081752568483, + "learning_rate": 0.2214460895310805, + "loss": 0.0968, + "num_input_tokens_seen": 12373824, + "step": 13680 + }, + { + "epoch": 3.611851656328362, + "grad_norm": 0.0019767009653151035, + "learning_rate": 0.22139429022075635, + "loss": 0.1302, + "num_input_tokens_seen": 12378368, + "step": 13685 + }, + { + "epoch": 3.613171439883859, + "grad_norm": 0.0019710769411176443, + "learning_rate": 0.22134247990053546, + "loss": 0.1164, + "num_input_tokens_seen": 12382944, + "step": 13690 + }, + { + "epoch": 3.614491223439356, + "grad_norm": 0.0019431919790804386, + "learning_rate": 0.2212906585784076, + "loss": 0.074, + "num_input_tokens_seen": 12387488, + "step": 13695 + }, + { + "epoch": 3.6158110069948526, + "grad_norm": 0.0016858529997989535, + "learning_rate": 0.22123882626236432, + "loss": 0.0836, + "num_input_tokens_seen": 12392064, + "step": 13700 + }, + { + "epoch": 3.6171307905503496, + "grad_norm": 0.003984855022281408, + "learning_rate": 0.2211869829603988, + "loss": 0.1345, + "num_input_tokens_seen": 12396608, + "step": 13705 + }, + { + "epoch": 3.6184505741058466, + "grad_norm": 0.002295352751389146, + "learning_rate": 0.22113512868050592, + "loss": 0.1005, + "num_input_tokens_seen": 12401344, + "step": 13710 + }, + { + "epoch": 3.6197703576613436, + "grad_norm": 0.0015935093397274613, + "learning_rate": 0.2210832634306822, + "loss": 0.0689, + "num_input_tokens_seen": 12405728, + "step": 13715 + }, + { + "epoch": 3.6210901412168406, + "grad_norm": 0.0019377759890630841, + "learning_rate": 0.22103138721892598, + "loss": 0.1101, + "num_input_tokens_seen": 12410336, + "step": 13720 + }, + { + "epoch": 3.622409924772337, + "grad_norm": 0.0030023441649973392, + "learning_rate": 0.22097950005323724, + "loss": 0.107, + "num_input_tokens_seen": 12414976, + "step": 13725 + }, + { + "epoch": 3.623729708327834, + "grad_norm": 0.0011654554400593042, + "learning_rate": 0.22092760194161762, + "loss": 0.1252, + "num_input_tokens_seen": 12419424, + "step": 13730 + }, + { + "epoch": 3.625049491883331, + "grad_norm": 0.0027751245070248842, + "learning_rate": 0.2208756928920704, + "loss": 0.1013, + "num_input_tokens_seen": 12423904, + "step": 13735 + }, + { + "epoch": 3.626369275438828, + "grad_norm": 0.0019410866079851985, + "learning_rate": 0.22082377291260072, + "loss": 0.1098, + "num_input_tokens_seen": 12428480, + "step": 13740 + }, + { + "epoch": 3.627689058994325, + "grad_norm": 0.0031078935135155916, + "learning_rate": 0.2207718420112152, + "loss": 0.0865, + "num_input_tokens_seen": 12433184, + "step": 13745 + }, + { + "epoch": 3.6290088425498217, + "grad_norm": 0.003575708484277129, + "learning_rate": 0.22071990019592228, + "loss": 0.114, + "num_input_tokens_seen": 12437472, + "step": 13750 + }, + { + "epoch": 3.6303286261053187, + "grad_norm": 0.0010785864433273673, + "learning_rate": 0.22066794747473198, + "loss": 0.0793, + "num_input_tokens_seen": 12441824, + "step": 13755 + }, + { + "epoch": 3.6316484096608157, + "grad_norm": 0.0018253701273351908, + "learning_rate": 0.2206159838556562, + "loss": 0.0765, + "num_input_tokens_seen": 12446208, + "step": 13760 + }, + { + "epoch": 3.6329681932163123, + "grad_norm": 0.001636011409573257, + "learning_rate": 0.2205640093467082, + "loss": 0.1171, + "num_input_tokens_seen": 12450816, + "step": 13765 + }, + { + "epoch": 3.6342879767718093, + "grad_norm": 0.0009232640732079744, + "learning_rate": 0.22051202395590322, + "loss": 0.0541, + "num_input_tokens_seen": 12455360, + "step": 13770 + }, + { + "epoch": 3.6356077603273063, + "grad_norm": 0.0009999643079936504, + "learning_rate": 0.22046002769125808, + "loss": 0.0928, + "num_input_tokens_seen": 12459744, + "step": 13775 + }, + { + "epoch": 3.6369275438828033, + "grad_norm": 0.0024548140354454517, + "learning_rate": 0.2204080205607912, + "loss": 0.0941, + "num_input_tokens_seen": 12464352, + "step": 13780 + }, + { + "epoch": 3.6382473274383003, + "grad_norm": 0.0016494286246597767, + "learning_rate": 0.22035600257252272, + "loss": 0.0899, + "num_input_tokens_seen": 12468832, + "step": 13785 + }, + { + "epoch": 3.639567110993797, + "grad_norm": 0.0012795644579455256, + "learning_rate": 0.2203039737344745, + "loss": 0.0958, + "num_input_tokens_seen": 12473280, + "step": 13790 + }, + { + "epoch": 3.640886894549294, + "grad_norm": 0.001717873034067452, + "learning_rate": 0.22025193405467003, + "loss": 0.0761, + "num_input_tokens_seen": 12478016, + "step": 13795 + }, + { + "epoch": 3.642206678104791, + "grad_norm": 0.0017649562796577811, + "learning_rate": 0.2201998835411345, + "loss": 0.0668, + "num_input_tokens_seen": 12482400, + "step": 13800 + }, + { + "epoch": 3.642206678104791, + "eval_loss": 0.11229463666677475, + "eval_runtime": 75.9106, + "eval_samples_per_second": 88.723, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 12482400, + "step": 13800 + }, + { + "epoch": 3.643526461660288, + "grad_norm": 0.0028610171284526587, + "learning_rate": 0.22014782220189474, + "loss": 0.1086, + "num_input_tokens_seen": 12487168, + "step": 13805 + }, + { + "epoch": 3.644846245215785, + "grad_norm": 0.002879627048969269, + "learning_rate": 0.2200957500449793, + "loss": 0.0943, + "num_input_tokens_seen": 12491872, + "step": 13810 + }, + { + "epoch": 3.6461660287712814, + "grad_norm": 0.0019081501523032784, + "learning_rate": 0.22004366707841827, + "loss": 0.1218, + "num_input_tokens_seen": 12496544, + "step": 13815 + }, + { + "epoch": 3.6474858123267784, + "grad_norm": 0.002490558661520481, + "learning_rate": 0.21999157331024358, + "loss": 0.1316, + "num_input_tokens_seen": 12500928, + "step": 13820 + }, + { + "epoch": 3.6488055958822754, + "grad_norm": 0.002213394269347191, + "learning_rate": 0.21993946874848871, + "loss": 0.1002, + "num_input_tokens_seen": 12505920, + "step": 13825 + }, + { + "epoch": 3.650125379437772, + "grad_norm": 0.00053006800590083, + "learning_rate": 0.2198873534011888, + "loss": 0.0553, + "num_input_tokens_seen": 12510560, + "step": 13830 + }, + { + "epoch": 3.6514451629932694, + "grad_norm": 0.0007612030603922904, + "learning_rate": 0.2198352272763808, + "loss": 0.0847, + "num_input_tokens_seen": 12515200, + "step": 13835 + }, + { + "epoch": 3.652764946548766, + "grad_norm": 0.002215442480519414, + "learning_rate": 0.2197830903821031, + "loss": 0.0998, + "num_input_tokens_seen": 12519680, + "step": 13840 + }, + { + "epoch": 3.654084730104263, + "grad_norm": 0.0035122917033731937, + "learning_rate": 0.21973094272639598, + "loss": 0.1326, + "num_input_tokens_seen": 12524096, + "step": 13845 + }, + { + "epoch": 3.65540451365976, + "grad_norm": 0.0016636907821521163, + "learning_rate": 0.21967878431730117, + "loss": 0.0668, + "num_input_tokens_seen": 12528960, + "step": 13850 + }, + { + "epoch": 3.6567242972152565, + "grad_norm": 0.0028836685232818127, + "learning_rate": 0.21962661516286217, + "loss": 0.0727, + "num_input_tokens_seen": 12533184, + "step": 13855 + }, + { + "epoch": 3.6580440807707535, + "grad_norm": 0.0023338417522609234, + "learning_rate": 0.21957443527112414, + "loss": 0.0911, + "num_input_tokens_seen": 12537664, + "step": 13860 + }, + { + "epoch": 3.6593638643262505, + "grad_norm": 0.00291541893966496, + "learning_rate": 0.21952224465013384, + "loss": 0.0988, + "num_input_tokens_seen": 12542112, + "step": 13865 + }, + { + "epoch": 3.6606836478817475, + "grad_norm": 0.0028167737182229757, + "learning_rate": 0.21947004330793976, + "loss": 0.1203, + "num_input_tokens_seen": 12546688, + "step": 13870 + }, + { + "epoch": 3.6620034314372445, + "grad_norm": 0.001837177318520844, + "learning_rate": 0.21941783125259198, + "loss": 0.0716, + "num_input_tokens_seen": 12551200, + "step": 13875 + }, + { + "epoch": 3.663323214992741, + "grad_norm": 0.0010149539448320866, + "learning_rate": 0.21936560849214226, + "loss": 0.0865, + "num_input_tokens_seen": 12555840, + "step": 13880 + }, + { + "epoch": 3.664642998548238, + "grad_norm": 0.003152852412313223, + "learning_rate": 0.21931337503464404, + "loss": 0.1122, + "num_input_tokens_seen": 12560224, + "step": 13885 + }, + { + "epoch": 3.665962782103735, + "grad_norm": 0.0024800922255963087, + "learning_rate": 0.21926113088815233, + "loss": 0.1112, + "num_input_tokens_seen": 12564928, + "step": 13890 + }, + { + "epoch": 3.667282565659232, + "grad_norm": 0.0014756250893697143, + "learning_rate": 0.2192088760607238, + "loss": 0.0729, + "num_input_tokens_seen": 12569472, + "step": 13895 + }, + { + "epoch": 3.668602349214729, + "grad_norm": 0.0015490367077291012, + "learning_rate": 0.2191566105604169, + "loss": 0.0668, + "num_input_tokens_seen": 12573888, + "step": 13900 + }, + { + "epoch": 3.6699221327702256, + "grad_norm": 0.0013653725618496537, + "learning_rate": 0.21910433439529153, + "loss": 0.0742, + "num_input_tokens_seen": 12578496, + "step": 13905 + }, + { + "epoch": 3.6712419163257226, + "grad_norm": 0.0025135038886219263, + "learning_rate": 0.2190520475734094, + "loss": 0.074, + "num_input_tokens_seen": 12583008, + "step": 13910 + }, + { + "epoch": 3.6725616998812196, + "grad_norm": 0.0013100848300382495, + "learning_rate": 0.2189997501028338, + "loss": 0.0586, + "num_input_tokens_seen": 12587328, + "step": 13915 + }, + { + "epoch": 3.673881483436716, + "grad_norm": 0.001298001385293901, + "learning_rate": 0.2189474419916296, + "loss": 0.0718, + "num_input_tokens_seen": 12592032, + "step": 13920 + }, + { + "epoch": 3.675201266992213, + "grad_norm": 0.004623972810804844, + "learning_rate": 0.21889512324786342, + "loss": 0.1207, + "num_input_tokens_seen": 12596352, + "step": 13925 + }, + { + "epoch": 3.67652105054771, + "grad_norm": 0.002324337838217616, + "learning_rate": 0.21884279387960345, + "loss": 0.1125, + "num_input_tokens_seen": 12601376, + "step": 13930 + }, + { + "epoch": 3.677840834103207, + "grad_norm": 0.0028427571523934603, + "learning_rate": 0.2187904538949195, + "loss": 0.1109, + "num_input_tokens_seen": 12605792, + "step": 13935 + }, + { + "epoch": 3.679160617658704, + "grad_norm": 0.003253603121265769, + "learning_rate": 0.2187381033018831, + "loss": 0.0919, + "num_input_tokens_seen": 12610560, + "step": 13940 + }, + { + "epoch": 3.6804804012142007, + "grad_norm": 0.0005792190204374492, + "learning_rate": 0.2186857421085673, + "loss": 0.0893, + "num_input_tokens_seen": 12615136, + "step": 13945 + }, + { + "epoch": 3.6818001847696977, + "grad_norm": 0.0023675011470913887, + "learning_rate": 0.21863337032304697, + "loss": 0.0789, + "num_input_tokens_seen": 12619360, + "step": 13950 + }, + { + "epoch": 3.6831199683251947, + "grad_norm": 0.0017459971131756902, + "learning_rate": 0.21858098795339845, + "loss": 0.0686, + "num_input_tokens_seen": 12623520, + "step": 13955 + }, + { + "epoch": 3.6844397518806917, + "grad_norm": 0.002538591157644987, + "learning_rate": 0.21852859500769975, + "loss": 0.1064, + "num_input_tokens_seen": 12628128, + "step": 13960 + }, + { + "epoch": 3.6857595354361887, + "grad_norm": 0.0016069011762738228, + "learning_rate": 0.21847619149403044, + "loss": 0.0609, + "num_input_tokens_seen": 12632672, + "step": 13965 + }, + { + "epoch": 3.6870793189916853, + "grad_norm": 0.0010659790132194757, + "learning_rate": 0.21842377742047195, + "loss": 0.1011, + "num_input_tokens_seen": 12637312, + "step": 13970 + }, + { + "epoch": 3.6883991025471823, + "grad_norm": 0.001748888986185193, + "learning_rate": 0.21837135279510705, + "loss": 0.1741, + "num_input_tokens_seen": 12641760, + "step": 13975 + }, + { + "epoch": 3.6897188861026793, + "grad_norm": 0.0016882625641301274, + "learning_rate": 0.21831891762602038, + "loss": 0.0901, + "num_input_tokens_seen": 12646432, + "step": 13980 + }, + { + "epoch": 3.691038669658176, + "grad_norm": 0.0020355756860226393, + "learning_rate": 0.21826647192129806, + "loss": 0.1254, + "num_input_tokens_seen": 12650752, + "step": 13985 + }, + { + "epoch": 3.692358453213673, + "grad_norm": 0.0026957655791193247, + "learning_rate": 0.21821401568902787, + "loss": 0.1074, + "num_input_tokens_seen": 12655456, + "step": 13990 + }, + { + "epoch": 3.69367823676917, + "grad_norm": 0.001476234756410122, + "learning_rate": 0.21816154893729925, + "loss": 0.0791, + "num_input_tokens_seen": 12660224, + "step": 13995 + }, + { + "epoch": 3.694998020324667, + "grad_norm": 0.0022351101506501436, + "learning_rate": 0.2181090716742032, + "loss": 0.1347, + "num_input_tokens_seen": 12664768, + "step": 14000 + }, + { + "epoch": 3.694998020324667, + "eval_loss": 0.1007077693939209, + "eval_runtime": 75.8572, + "eval_samples_per_second": 88.785, + "eval_steps_per_second": 22.2, + "num_input_tokens_seen": 12664768, + "step": 14000 + }, + { + "epoch": 3.696317803880164, + "grad_norm": 0.0031585367396473885, + "learning_rate": 0.21805658390783236, + "loss": 0.1379, + "num_input_tokens_seen": 12669632, + "step": 14005 + }, + { + "epoch": 3.6976375874356604, + "grad_norm": 0.0012923722388222814, + "learning_rate": 0.21800408564628107, + "loss": 0.0853, + "num_input_tokens_seen": 12674016, + "step": 14010 + }, + { + "epoch": 3.6989573709911574, + "grad_norm": 0.0032944323029369116, + "learning_rate": 0.21795157689764516, + "loss": 0.0887, + "num_input_tokens_seen": 12678528, + "step": 14015 + }, + { + "epoch": 3.7002771545466544, + "grad_norm": 0.0024438081309199333, + "learning_rate": 0.21789905767002216, + "loss": 0.0799, + "num_input_tokens_seen": 12683040, + "step": 14020 + }, + { + "epoch": 3.7015969381021514, + "grad_norm": 0.0011897757649421692, + "learning_rate": 0.2178465279715112, + "loss": 0.0669, + "num_input_tokens_seen": 12687552, + "step": 14025 + }, + { + "epoch": 3.7029167216576484, + "grad_norm": 0.0007671374478377402, + "learning_rate": 0.21779398781021303, + "loss": 0.0679, + "num_input_tokens_seen": 12692352, + "step": 14030 + }, + { + "epoch": 3.704236505213145, + "grad_norm": 0.0010504659730941057, + "learning_rate": 0.21774143719422998, + "loss": 0.0731, + "num_input_tokens_seen": 12697024, + "step": 14035 + }, + { + "epoch": 3.705556288768642, + "grad_norm": 0.002274894854053855, + "learning_rate": 0.21768887613166601, + "loss": 0.0867, + "num_input_tokens_seen": 12701888, + "step": 14040 + }, + { + "epoch": 3.706876072324139, + "grad_norm": 0.0006622325745411217, + "learning_rate": 0.2176363046306267, + "loss": 0.0795, + "num_input_tokens_seen": 12706464, + "step": 14045 + }, + { + "epoch": 3.7081958558796355, + "grad_norm": 0.0016897174064069986, + "learning_rate": 0.21758372269921925, + "loss": 0.0731, + "num_input_tokens_seen": 12711136, + "step": 14050 + }, + { + "epoch": 3.7095156394351325, + "grad_norm": 0.001199804712086916, + "learning_rate": 0.21753113034555244, + "loss": 0.0766, + "num_input_tokens_seen": 12715552, + "step": 14055 + }, + { + "epoch": 3.7108354229906295, + "grad_norm": 0.0021780782844871283, + "learning_rate": 0.2174785275777367, + "loss": 0.1136, + "num_input_tokens_seen": 12720160, + "step": 14060 + }, + { + "epoch": 3.7121552065461265, + "grad_norm": 0.0016077341279014945, + "learning_rate": 0.21742591440388404, + "loss": 0.097, + "num_input_tokens_seen": 12724448, + "step": 14065 + }, + { + "epoch": 3.7134749901016235, + "grad_norm": 0.002912525786086917, + "learning_rate": 0.21737329083210802, + "loss": 0.0877, + "num_input_tokens_seen": 12728960, + "step": 14070 + }, + { + "epoch": 3.71479477365712, + "grad_norm": 0.00047376335714943707, + "learning_rate": 0.2173206568705239, + "loss": 0.0631, + "num_input_tokens_seen": 12733408, + "step": 14075 + }, + { + "epoch": 3.716114557212617, + "grad_norm": 0.002751704305410385, + "learning_rate": 0.2172680125272485, + "loss": 0.0818, + "num_input_tokens_seen": 12737952, + "step": 14080 + }, + { + "epoch": 3.717434340768114, + "grad_norm": 0.0022919881157577038, + "learning_rate": 0.2172153578104002, + "loss": 0.0641, + "num_input_tokens_seen": 12742336, + "step": 14085 + }, + { + "epoch": 3.718754124323611, + "grad_norm": 0.0007525170804001391, + "learning_rate": 0.21716269272809902, + "loss": 0.0532, + "num_input_tokens_seen": 12746848, + "step": 14090 + }, + { + "epoch": 3.720073907879108, + "grad_norm": 0.001933772349730134, + "learning_rate": 0.21711001728846666, + "loss": 0.1268, + "num_input_tokens_seen": 12751520, + "step": 14095 + }, + { + "epoch": 3.7213936914346046, + "grad_norm": 0.003799251513555646, + "learning_rate": 0.21705733149962628, + "loss": 0.1093, + "num_input_tokens_seen": 12756160, + "step": 14100 + }, + { + "epoch": 3.7227134749901016, + "grad_norm": 0.00364253344014287, + "learning_rate": 0.21700463536970263, + "loss": 0.1312, + "num_input_tokens_seen": 12760320, + "step": 14105 + }, + { + "epoch": 3.7240332585455986, + "grad_norm": 0.0003963171620853245, + "learning_rate": 0.21695192890682222, + "loss": 0.0855, + "num_input_tokens_seen": 12764704, + "step": 14110 + }, + { + "epoch": 3.725353042101095, + "grad_norm": 0.00180200079921633, + "learning_rate": 0.21689921211911298, + "loss": 0.0823, + "num_input_tokens_seen": 12769152, + "step": 14115 + }, + { + "epoch": 3.726672825656592, + "grad_norm": 0.0017203963361680508, + "learning_rate": 0.21684648501470452, + "loss": 0.0993, + "num_input_tokens_seen": 12773632, + "step": 14120 + }, + { + "epoch": 3.727992609212089, + "grad_norm": 0.0018040823051705956, + "learning_rate": 0.216793747601728, + "loss": 0.0765, + "num_input_tokens_seen": 12778016, + "step": 14125 + }, + { + "epoch": 3.729312392767586, + "grad_norm": 0.001865898142568767, + "learning_rate": 0.21674099988831627, + "loss": 0.104, + "num_input_tokens_seen": 12782624, + "step": 14130 + }, + { + "epoch": 3.730632176323083, + "grad_norm": 0.0025035045109689236, + "learning_rate": 0.21668824188260363, + "loss": 0.0889, + "num_input_tokens_seen": 12787360, + "step": 14135 + }, + { + "epoch": 3.7319519598785797, + "grad_norm": 0.0026364855002611876, + "learning_rate": 0.21663547359272606, + "loss": 0.1051, + "num_input_tokens_seen": 12791808, + "step": 14140 + }, + { + "epoch": 3.7332717434340768, + "grad_norm": 0.0017351401038467884, + "learning_rate": 0.216582695026821, + "loss": 0.0921, + "num_input_tokens_seen": 12796224, + "step": 14145 + }, + { + "epoch": 3.7345915269895738, + "grad_norm": 0.0033312595915049314, + "learning_rate": 0.21652990619302767, + "loss": 0.1088, + "num_input_tokens_seen": 12800480, + "step": 14150 + }, + { + "epoch": 3.7359113105450708, + "grad_norm": 0.0014338571345433593, + "learning_rate": 0.21647710709948673, + "loss": 0.0812, + "num_input_tokens_seen": 12805216, + "step": 14155 + }, + { + "epoch": 3.7372310941005678, + "grad_norm": 0.0024254980962723494, + "learning_rate": 0.2164242977543405, + "loss": 0.1041, + "num_input_tokens_seen": 12809600, + "step": 14160 + }, + { + "epoch": 3.7385508776560643, + "grad_norm": 0.002334217308089137, + "learning_rate": 0.21637147816573277, + "loss": 0.1397, + "num_input_tokens_seen": 12814592, + "step": 14165 + }, + { + "epoch": 3.7398706612115613, + "grad_norm": 0.0019426411017775536, + "learning_rate": 0.21631864834180908, + "loss": 0.0847, + "num_input_tokens_seen": 12819104, + "step": 14170 + }, + { + "epoch": 3.7411904447670583, + "grad_norm": 0.00240137055516243, + "learning_rate": 0.21626580829071637, + "loss": 0.1219, + "num_input_tokens_seen": 12823392, + "step": 14175 + }, + { + "epoch": 3.742510228322555, + "grad_norm": 0.002446336904540658, + "learning_rate": 0.21621295802060328, + "loss": 0.0825, + "num_input_tokens_seen": 12827584, + "step": 14180 + }, + { + "epoch": 3.743830011878052, + "grad_norm": 0.0019033602438867092, + "learning_rate": 0.21616009753961996, + "loss": 0.112, + "num_input_tokens_seen": 12832096, + "step": 14185 + }, + { + "epoch": 3.745149795433549, + "grad_norm": 0.002431324450299144, + "learning_rate": 0.2161072268559182, + "loss": 0.0761, + "num_input_tokens_seen": 12836768, + "step": 14190 + }, + { + "epoch": 3.746469578989046, + "grad_norm": 0.0027777040377259254, + "learning_rate": 0.21605434597765133, + "loss": 0.1241, + "num_input_tokens_seen": 12841376, + "step": 14195 + }, + { + "epoch": 3.747789362544543, + "grad_norm": 0.0017686366336420178, + "learning_rate": 0.21600145491297418, + "loss": 0.1077, + "num_input_tokens_seen": 12845696, + "step": 14200 + }, + { + "epoch": 3.747789362544543, + "eval_loss": 0.09821461141109467, + "eval_runtime": 75.8121, + "eval_samples_per_second": 88.838, + "eval_steps_per_second": 22.213, + "num_input_tokens_seen": 12845696, + "step": 14200 + }, + { + "epoch": 3.7491091461000394, + "grad_norm": 0.002565725240856409, + "learning_rate": 0.21594855367004326, + "loss": 0.0823, + "num_input_tokens_seen": 12850464, + "step": 14205 + }, + { + "epoch": 3.7504289296555364, + "grad_norm": 0.0023992378264665604, + "learning_rate": 0.21589564225701663, + "loss": 0.0732, + "num_input_tokens_seen": 12854752, + "step": 14210 + }, + { + "epoch": 3.7517487132110334, + "grad_norm": 0.0027770441956818104, + "learning_rate": 0.21584272068205385, + "loss": 0.1084, + "num_input_tokens_seen": 12859392, + "step": 14215 + }, + { + "epoch": 3.7530684967665304, + "grad_norm": 0.0008042986155487597, + "learning_rate": 0.2157897889533161, + "loss": 0.0802, + "num_input_tokens_seen": 12864160, + "step": 14220 + }, + { + "epoch": 3.7543882803220274, + "grad_norm": 0.0018990634707733989, + "learning_rate": 0.21573684707896612, + "loss": 0.0858, + "num_input_tokens_seen": 12868608, + "step": 14225 + }, + { + "epoch": 3.755708063877524, + "grad_norm": 0.0016103272791951895, + "learning_rate": 0.21568389506716826, + "loss": 0.0825, + "num_input_tokens_seen": 12873248, + "step": 14230 + }, + { + "epoch": 3.757027847433021, + "grad_norm": 0.002545631490647793, + "learning_rate": 0.21563093292608831, + "loss": 0.0873, + "num_input_tokens_seen": 12877696, + "step": 14235 + }, + { + "epoch": 3.758347630988518, + "grad_norm": 0.0020392213482409716, + "learning_rate": 0.21557796066389376, + "loss": 0.0872, + "num_input_tokens_seen": 12882464, + "step": 14240 + }, + { + "epoch": 3.7596674145440145, + "grad_norm": 0.004045067820698023, + "learning_rate": 0.21552497828875353, + "loss": 0.1166, + "num_input_tokens_seen": 12886656, + "step": 14245 + }, + { + "epoch": 3.7609871980995115, + "grad_norm": 0.0016937596956267953, + "learning_rate": 0.21547198580883828, + "loss": 0.0782, + "num_input_tokens_seen": 12891136, + "step": 14250 + }, + { + "epoch": 3.7623069816550085, + "grad_norm": 0.003384212264791131, + "learning_rate": 0.21541898323232, + "loss": 0.0979, + "num_input_tokens_seen": 12895744, + "step": 14255 + }, + { + "epoch": 3.7636267652105055, + "grad_norm": 0.0004205000295769423, + "learning_rate": 0.2153659705673724, + "loss": 0.0727, + "num_input_tokens_seen": 12900096, + "step": 14260 + }, + { + "epoch": 3.7649465487660025, + "grad_norm": 0.0018556936411187053, + "learning_rate": 0.2153129478221707, + "loss": 0.0844, + "num_input_tokens_seen": 12904704, + "step": 14265 + }, + { + "epoch": 3.766266332321499, + "grad_norm": 0.0011711023980751634, + "learning_rate": 0.21525991500489164, + "loss": 0.0766, + "num_input_tokens_seen": 12909280, + "step": 14270 + }, + { + "epoch": 3.767586115876996, + "grad_norm": 0.0009973642881959677, + "learning_rate": 0.21520687212371362, + "loss": 0.065, + "num_input_tokens_seen": 12914080, + "step": 14275 + }, + { + "epoch": 3.768905899432493, + "grad_norm": 0.002288956893607974, + "learning_rate": 0.21515381918681648, + "loss": 0.0589, + "num_input_tokens_seen": 12918528, + "step": 14280 + }, + { + "epoch": 3.77022568298799, + "grad_norm": 0.0021905472967773676, + "learning_rate": 0.21510075620238167, + "loss": 0.0992, + "num_input_tokens_seen": 12922784, + "step": 14285 + }, + { + "epoch": 3.771545466543487, + "grad_norm": 0.0030622039921581745, + "learning_rate": 0.21504768317859208, + "loss": 0.1341, + "num_input_tokens_seen": 12927168, + "step": 14290 + }, + { + "epoch": 3.7728652500989837, + "grad_norm": 0.0018589695682749152, + "learning_rate": 0.2149946001236323, + "loss": 0.1083, + "num_input_tokens_seen": 12931648, + "step": 14295 + }, + { + "epoch": 3.7741850336544807, + "grad_norm": 0.002149982610717416, + "learning_rate": 0.21494150704568848, + "loss": 0.1082, + "num_input_tokens_seen": 12936416, + "step": 14300 + }, + { + "epoch": 3.7755048172099777, + "grad_norm": 0.0020791750866919756, + "learning_rate": 0.21488840395294811, + "loss": 0.0805, + "num_input_tokens_seen": 12940672, + "step": 14305 + }, + { + "epoch": 3.776824600765474, + "grad_norm": 0.0005691720289178193, + "learning_rate": 0.21483529085360042, + "loss": 0.0563, + "num_input_tokens_seen": 12945440, + "step": 14310 + }, + { + "epoch": 3.778144384320971, + "grad_norm": 0.0011425584089010954, + "learning_rate": 0.2147821677558361, + "loss": 0.1064, + "num_input_tokens_seen": 12949984, + "step": 14315 + }, + { + "epoch": 3.779464167876468, + "grad_norm": 0.002593591110780835, + "learning_rate": 0.2147290346678475, + "loss": 0.1039, + "num_input_tokens_seen": 12954656, + "step": 14320 + }, + { + "epoch": 3.780783951431965, + "grad_norm": 0.0014524831203743815, + "learning_rate": 0.21467589159782827, + "loss": 0.0615, + "num_input_tokens_seen": 12959328, + "step": 14325 + }, + { + "epoch": 3.782103734987462, + "grad_norm": 0.0017910203896462917, + "learning_rate": 0.21462273855397374, + "loss": 0.0779, + "num_input_tokens_seen": 12963584, + "step": 14330 + }, + { + "epoch": 3.7834235185429588, + "grad_norm": 0.0015136237489059567, + "learning_rate": 0.21456957554448083, + "loss": 0.1413, + "num_input_tokens_seen": 12968000, + "step": 14335 + }, + { + "epoch": 3.7847433020984558, + "grad_norm": 0.001541885663755238, + "learning_rate": 0.21451640257754795, + "loss": 0.0773, + "num_input_tokens_seen": 12972224, + "step": 14340 + }, + { + "epoch": 3.7860630856539528, + "grad_norm": 0.0010845212964341044, + "learning_rate": 0.21446321966137508, + "loss": 0.0841, + "num_input_tokens_seen": 12976544, + "step": 14345 + }, + { + "epoch": 3.7873828692094498, + "grad_norm": 0.0022984833922237158, + "learning_rate": 0.21441002680416354, + "loss": 0.0761, + "num_input_tokens_seen": 12981120, + "step": 14350 + }, + { + "epoch": 3.7887026527649468, + "grad_norm": 0.0029049983713775873, + "learning_rate": 0.21435682401411654, + "loss": 0.1584, + "num_input_tokens_seen": 12985504, + "step": 14355 + }, + { + "epoch": 3.7900224363204433, + "grad_norm": 0.002281011315062642, + "learning_rate": 0.2143036112994385, + "loss": 0.0598, + "num_input_tokens_seen": 12990272, + "step": 14360 + }, + { + "epoch": 3.7913422198759403, + "grad_norm": 0.0017775228479877114, + "learning_rate": 0.21425038866833548, + "loss": 0.0847, + "num_input_tokens_seen": 12994688, + "step": 14365 + }, + { + "epoch": 3.7926620034314373, + "grad_norm": 0.0015123222256079316, + "learning_rate": 0.21419715612901508, + "loss": 0.0863, + "num_input_tokens_seen": 12999072, + "step": 14370 + }, + { + "epoch": 3.793981786986934, + "grad_norm": 0.0029994486831128597, + "learning_rate": 0.21414391368968652, + "loss": 0.1189, + "num_input_tokens_seen": 13003872, + "step": 14375 + }, + { + "epoch": 3.795301570542431, + "grad_norm": 0.0024862969294190407, + "learning_rate": 0.21409066135856034, + "loss": 0.0975, + "num_input_tokens_seen": 13008512, + "step": 14380 + }, + { + "epoch": 3.796621354097928, + "grad_norm": 0.0018012874061241746, + "learning_rate": 0.21403739914384878, + "loss": 0.0748, + "num_input_tokens_seen": 13012960, + "step": 14385 + }, + { + "epoch": 3.797941137653425, + "grad_norm": 0.0013834385899826884, + "learning_rate": 0.21398412705376554, + "loss": 0.0867, + "num_input_tokens_seen": 13017600, + "step": 14390 + }, + { + "epoch": 3.799260921208922, + "grad_norm": 0.0024361885152757168, + "learning_rate": 0.2139308450965258, + "loss": 0.0834, + "num_input_tokens_seen": 13022080, + "step": 14395 + }, + { + "epoch": 3.8005807047644184, + "grad_norm": 0.0019122925586998463, + "learning_rate": 0.21387755328034638, + "loss": 0.1079, + "num_input_tokens_seen": 13026720, + "step": 14400 + }, + { + "epoch": 3.8005807047644184, + "eval_loss": 0.09737921506166458, + "eval_runtime": 75.9394, + "eval_samples_per_second": 88.689, + "eval_steps_per_second": 22.176, + "num_input_tokens_seen": 13026720, + "step": 14400 + }, + { + "epoch": 3.8019004883199154, + "grad_norm": 0.0029960214160382748, + "learning_rate": 0.2138242516134455, + "loss": 0.0717, + "num_input_tokens_seen": 13031488, + "step": 14405 + }, + { + "epoch": 3.8032202718754125, + "grad_norm": 0.0037117216270416975, + "learning_rate": 0.2137709401040429, + "loss": 0.0755, + "num_input_tokens_seen": 13035936, + "step": 14410 + }, + { + "epoch": 3.8045400554309095, + "grad_norm": 0.0012244052486494184, + "learning_rate": 0.21371761876036, + "loss": 0.1075, + "num_input_tokens_seen": 13040096, + "step": 14415 + }, + { + "epoch": 3.8058598389864065, + "grad_norm": 0.002360421232879162, + "learning_rate": 0.21366428759061956, + "loss": 0.1076, + "num_input_tokens_seen": 13044544, + "step": 14420 + }, + { + "epoch": 3.807179622541903, + "grad_norm": 0.001842251978814602, + "learning_rate": 0.2136109466030459, + "loss": 0.081, + "num_input_tokens_seen": 13049440, + "step": 14425 + }, + { + "epoch": 3.8084994060974, + "grad_norm": 0.0008570817881263793, + "learning_rate": 0.2135575958058649, + "loss": 0.0978, + "num_input_tokens_seen": 13053888, + "step": 14430 + }, + { + "epoch": 3.809819189652897, + "grad_norm": 0.0011351491557434201, + "learning_rate": 0.2135042352073039, + "loss": 0.0994, + "num_input_tokens_seen": 13058432, + "step": 14435 + }, + { + "epoch": 3.8111389732083936, + "grad_norm": 0.0016814840491861105, + "learning_rate": 0.2134508648155918, + "loss": 0.105, + "num_input_tokens_seen": 13063040, + "step": 14440 + }, + { + "epoch": 3.8124587567638906, + "grad_norm": 0.002202327363193035, + "learning_rate": 0.213397484638959, + "loss": 0.1192, + "num_input_tokens_seen": 13067776, + "step": 14445 + }, + { + "epoch": 3.8137785403193876, + "grad_norm": 0.0014528275933116674, + "learning_rate": 0.21334409468563728, + "loss": 0.0964, + "num_input_tokens_seen": 13072160, + "step": 14450 + }, + { + "epoch": 3.8150983238748846, + "grad_norm": 0.0030940657015889883, + "learning_rate": 0.2132906949638602, + "loss": 0.1287, + "num_input_tokens_seen": 13076800, + "step": 14455 + }, + { + "epoch": 3.8164181074303816, + "grad_norm": 0.0018331039464101195, + "learning_rate": 0.21323728548186255, + "loss": 0.0991, + "num_input_tokens_seen": 13081632, + "step": 14460 + }, + { + "epoch": 3.817737890985878, + "grad_norm": 0.001649472862482071, + "learning_rate": 0.21318386624788088, + "loss": 0.0696, + "num_input_tokens_seen": 13085920, + "step": 14465 + }, + { + "epoch": 3.819057674541375, + "grad_norm": 0.0015310058370232582, + "learning_rate": 0.21313043727015288, + "loss": 0.0556, + "num_input_tokens_seen": 13090432, + "step": 14470 + }, + { + "epoch": 3.820377458096872, + "grad_norm": 0.0005022802506573498, + "learning_rate": 0.2130769985569182, + "loss": 0.0615, + "num_input_tokens_seen": 13094912, + "step": 14475 + }, + { + "epoch": 3.821697241652369, + "grad_norm": 0.0021463166922330856, + "learning_rate": 0.21302355011641766, + "loss": 0.0599, + "num_input_tokens_seen": 13099648, + "step": 14480 + }, + { + "epoch": 3.823017025207866, + "grad_norm": 0.001782106002792716, + "learning_rate": 0.21297009195689365, + "loss": 0.0996, + "num_input_tokens_seen": 13104256, + "step": 14485 + }, + { + "epoch": 3.8243368087633627, + "grad_norm": 0.00341438758186996, + "learning_rate": 0.21291662408659015, + "loss": 0.1421, + "num_input_tokens_seen": 13108992, + "step": 14490 + }, + { + "epoch": 3.8256565923188597, + "grad_norm": 0.0004315595142543316, + "learning_rate": 0.21286314651375254, + "loss": 0.0628, + "num_input_tokens_seen": 13113792, + "step": 14495 + }, + { + "epoch": 3.8269763758743567, + "grad_norm": 0.0031023211777210236, + "learning_rate": 0.2128096592466278, + "loss": 0.102, + "num_input_tokens_seen": 13118592, + "step": 14500 + }, + { + "epoch": 3.8282961594298532, + "grad_norm": 0.0027095063123852015, + "learning_rate": 0.21275616229346428, + "loss": 0.0946, + "num_input_tokens_seen": 13122976, + "step": 14505 + }, + { + "epoch": 3.8296159429853502, + "grad_norm": 0.003336968133226037, + "learning_rate": 0.21270265566251184, + "loss": 0.1341, + "num_input_tokens_seen": 13127520, + "step": 14510 + }, + { + "epoch": 3.8309357265408472, + "grad_norm": 0.0010083084926009178, + "learning_rate": 0.21264913936202193, + "loss": 0.079, + "num_input_tokens_seen": 13131872, + "step": 14515 + }, + { + "epoch": 3.8322555100963442, + "grad_norm": 0.0022546558175235987, + "learning_rate": 0.2125956134002475, + "loss": 0.1007, + "num_input_tokens_seen": 13136384, + "step": 14520 + }, + { + "epoch": 3.8335752936518412, + "grad_norm": 0.0033082766458392143, + "learning_rate": 0.2125420777854428, + "loss": 0.092, + "num_input_tokens_seen": 13141152, + "step": 14525 + }, + { + "epoch": 3.834895077207338, + "grad_norm": 0.0037200774531811476, + "learning_rate": 0.21248853252586372, + "loss": 0.0937, + "num_input_tokens_seen": 13145760, + "step": 14530 + }, + { + "epoch": 3.836214860762835, + "grad_norm": 0.0014523972058668733, + "learning_rate": 0.21243497762976774, + "loss": 0.0865, + "num_input_tokens_seen": 13149952, + "step": 14535 + }, + { + "epoch": 3.837534644318332, + "grad_norm": 0.0016680503031238914, + "learning_rate": 0.21238141310541356, + "loss": 0.0736, + "num_input_tokens_seen": 13154272, + "step": 14540 + }, + { + "epoch": 3.838854427873829, + "grad_norm": 0.002318853745236993, + "learning_rate": 0.21232783896106153, + "loss": 0.0614, + "num_input_tokens_seen": 13159008, + "step": 14545 + }, + { + "epoch": 3.840174211429326, + "grad_norm": 0.001707285875454545, + "learning_rate": 0.21227425520497345, + "loss": 0.1039, + "num_input_tokens_seen": 13163424, + "step": 14550 + }, + { + "epoch": 3.8414939949848224, + "grad_norm": 0.002285919152200222, + "learning_rate": 0.2122206618454127, + "loss": 0.075, + "num_input_tokens_seen": 13167712, + "step": 14555 + }, + { + "epoch": 3.8428137785403194, + "grad_norm": 0.002420698991045356, + "learning_rate": 0.2121670588906439, + "loss": 0.1069, + "num_input_tokens_seen": 13172160, + "step": 14560 + }, + { + "epoch": 3.8441335620958164, + "grad_norm": 0.0030243333894759417, + "learning_rate": 0.21211344634893345, + "loss": 0.0872, + "num_input_tokens_seen": 13176608, + "step": 14565 + }, + { + "epoch": 3.845453345651313, + "grad_norm": 0.0015787980519235134, + "learning_rate": 0.21205982422854897, + "loss": 0.0794, + "num_input_tokens_seen": 13180960, + "step": 14570 + }, + { + "epoch": 3.8467731292068104, + "grad_norm": 0.002638098318129778, + "learning_rate": 0.21200619253775974, + "loss": 0.1169, + "num_input_tokens_seen": 13185504, + "step": 14575 + }, + { + "epoch": 3.848092912762307, + "grad_norm": 0.002542914357036352, + "learning_rate": 0.21195255128483637, + "loss": 0.1458, + "num_input_tokens_seen": 13189824, + "step": 14580 + }, + { + "epoch": 3.849412696317804, + "grad_norm": 0.002090641064569354, + "learning_rate": 0.21189890047805102, + "loss": 0.0759, + "num_input_tokens_seen": 13194432, + "step": 14585 + }, + { + "epoch": 3.850732479873301, + "grad_norm": 0.0015661421930417418, + "learning_rate": 0.21184524012567735, + "loss": 0.1126, + "num_input_tokens_seen": 13199008, + "step": 14590 + }, + { + "epoch": 3.8520522634287975, + "grad_norm": 0.001968379830941558, + "learning_rate": 0.2117915702359905, + "loss": 0.0902, + "num_input_tokens_seen": 13203392, + "step": 14595 + }, + { + "epoch": 3.8533720469842945, + "grad_norm": 0.001251296023838222, + "learning_rate": 0.211737890817267, + "loss": 0.0745, + "num_input_tokens_seen": 13207904, + "step": 14600 + }, + { + "epoch": 3.8533720469842945, + "eval_loss": 0.09754931181669235, + "eval_runtime": 75.7865, + "eval_samples_per_second": 88.868, + "eval_steps_per_second": 22.22, + "num_input_tokens_seen": 13207904, + "step": 14600 + }, + { + "epoch": 3.8546918305397915, + "grad_norm": 0.002692712703719735, + "learning_rate": 0.21168420187778483, + "loss": 0.0755, + "num_input_tokens_seen": 13212960, + "step": 14605 + }, + { + "epoch": 3.8560116140952885, + "grad_norm": 0.0028353866655379534, + "learning_rate": 0.21163050342582362, + "loss": 0.1072, + "num_input_tokens_seen": 13217728, + "step": 14610 + }, + { + "epoch": 3.8573313976507855, + "grad_norm": 0.0019025432411581278, + "learning_rate": 0.21157679546966426, + "loss": 0.051, + "num_input_tokens_seen": 13222016, + "step": 14615 + }, + { + "epoch": 3.858651181206282, + "grad_norm": 0.0020416409242898226, + "learning_rate": 0.2115230780175892, + "loss": 0.0827, + "num_input_tokens_seen": 13226560, + "step": 14620 + }, + { + "epoch": 3.859970964761779, + "grad_norm": 0.000998918665573001, + "learning_rate": 0.21146935107788237, + "loss": 0.0947, + "num_input_tokens_seen": 13230944, + "step": 14625 + }, + { + "epoch": 3.861290748317276, + "grad_norm": 0.0019469140097498894, + "learning_rate": 0.21141561465882916, + "loss": 0.1036, + "num_input_tokens_seen": 13235584, + "step": 14630 + }, + { + "epoch": 3.8626105318727726, + "grad_norm": 0.0010230507468804717, + "learning_rate": 0.21136186876871635, + "loss": 0.0744, + "num_input_tokens_seen": 13240256, + "step": 14635 + }, + { + "epoch": 3.86393031542827, + "grad_norm": 0.0018590608378872275, + "learning_rate": 0.21130811341583225, + "loss": 0.0702, + "num_input_tokens_seen": 13244832, + "step": 14640 + }, + { + "epoch": 3.8652500989837666, + "grad_norm": 0.0008694605203345418, + "learning_rate": 0.21125434860846667, + "loss": 0.0495, + "num_input_tokens_seen": 13249280, + "step": 14645 + }, + { + "epoch": 3.8665698825392636, + "grad_norm": 0.0027170577086508274, + "learning_rate": 0.2112005743549107, + "loss": 0.0636, + "num_input_tokens_seen": 13254080, + "step": 14650 + }, + { + "epoch": 3.8678896660947606, + "grad_norm": 0.002329431474208832, + "learning_rate": 0.21114679066345707, + "loss": 0.1023, + "num_input_tokens_seen": 13258592, + "step": 14655 + }, + { + "epoch": 3.869209449650257, + "grad_norm": 0.0017363084480166435, + "learning_rate": 0.21109299754239993, + "loss": 0.1273, + "num_input_tokens_seen": 13262720, + "step": 14660 + }, + { + "epoch": 3.870529233205754, + "grad_norm": 0.0005146227777004242, + "learning_rate": 0.21103919500003482, + "loss": 0.0822, + "num_input_tokens_seen": 13267392, + "step": 14665 + }, + { + "epoch": 3.871849016761251, + "grad_norm": 0.002385832602158189, + "learning_rate": 0.21098538304465872, + "loss": 0.0907, + "num_input_tokens_seen": 13272288, + "step": 14670 + }, + { + "epoch": 3.873168800316748, + "grad_norm": 0.0011226751375943422, + "learning_rate": 0.2109315616845702, + "loss": 0.0738, + "num_input_tokens_seen": 13276736, + "step": 14675 + }, + { + "epoch": 3.874488583872245, + "grad_norm": 0.0010806632926687598, + "learning_rate": 0.21087773092806925, + "loss": 0.0597, + "num_input_tokens_seen": 13281152, + "step": 14680 + }, + { + "epoch": 3.8758083674277417, + "grad_norm": 0.0026031637098640203, + "learning_rate": 0.21082389078345704, + "loss": 0.1046, + "num_input_tokens_seen": 13285664, + "step": 14685 + }, + { + "epoch": 3.8771281509832387, + "grad_norm": 0.0008948129252530634, + "learning_rate": 0.2107700412590365, + "loss": 0.0566, + "num_input_tokens_seen": 13290048, + "step": 14690 + }, + { + "epoch": 3.8784479345387357, + "grad_norm": 0.0024400479160249233, + "learning_rate": 0.210716182363112, + "loss": 0.1031, + "num_input_tokens_seen": 13294592, + "step": 14695 + }, + { + "epoch": 3.8797677180942327, + "grad_norm": 0.0027538733556866646, + "learning_rate": 0.2106623141039891, + "loss": 0.0718, + "num_input_tokens_seen": 13299040, + "step": 14700 + }, + { + "epoch": 3.8810875016497297, + "grad_norm": 0.0017725002253428102, + "learning_rate": 0.21060843648997507, + "loss": 0.0379, + "num_input_tokens_seen": 13303264, + "step": 14705 + }, + { + "epoch": 3.8824072852052263, + "grad_norm": 0.00321976188570261, + "learning_rate": 0.21055454952937844, + "loss": 0.0722, + "num_input_tokens_seen": 13307936, + "step": 14710 + }, + { + "epoch": 3.8837270687607233, + "grad_norm": 0.0011186215560883284, + "learning_rate": 0.21050065323050937, + "loss": 0.0614, + "num_input_tokens_seen": 13312288, + "step": 14715 + }, + { + "epoch": 3.8850468523162203, + "grad_norm": 0.002014175755903125, + "learning_rate": 0.21044674760167928, + "loss": 0.0897, + "num_input_tokens_seen": 13316736, + "step": 14720 + }, + { + "epoch": 3.886366635871717, + "grad_norm": 0.0012091356329619884, + "learning_rate": 0.210392832651201, + "loss": 0.1151, + "num_input_tokens_seen": 13321024, + "step": 14725 + }, + { + "epoch": 3.887686419427214, + "grad_norm": 0.0027313006576150656, + "learning_rate": 0.210338908387389, + "loss": 0.1447, + "num_input_tokens_seen": 13325568, + "step": 14730 + }, + { + "epoch": 3.889006202982711, + "grad_norm": 0.002413674257695675, + "learning_rate": 0.21028497481855912, + "loss": 0.0774, + "num_input_tokens_seen": 13330208, + "step": 14735 + }, + { + "epoch": 3.890325986538208, + "grad_norm": 0.0014348650583997369, + "learning_rate": 0.21023103195302847, + "loss": 0.1027, + "num_input_tokens_seen": 13334752, + "step": 14740 + }, + { + "epoch": 3.891645770093705, + "grad_norm": 0.0016367863863706589, + "learning_rate": 0.21017707979911582, + "loss": 0.11, + "num_input_tokens_seen": 13339328, + "step": 14745 + }, + { + "epoch": 3.8929655536492014, + "grad_norm": 0.0027204426005482674, + "learning_rate": 0.21012311836514122, + "loss": 0.0957, + "num_input_tokens_seen": 13343744, + "step": 14750 + }, + { + "epoch": 3.8942853372046984, + "grad_norm": 0.0032926772255450487, + "learning_rate": 0.21006914765942622, + "loss": 0.0793, + "num_input_tokens_seen": 13348352, + "step": 14755 + }, + { + "epoch": 3.8956051207601954, + "grad_norm": 0.0009267052519135177, + "learning_rate": 0.2100151676902938, + "loss": 0.075, + "num_input_tokens_seen": 13352672, + "step": 14760 + }, + { + "epoch": 3.8969249043156924, + "grad_norm": 0.0029822872020304203, + "learning_rate": 0.2099611784660683, + "loss": 0.0829, + "num_input_tokens_seen": 13357184, + "step": 14765 + }, + { + "epoch": 3.8982446878711894, + "grad_norm": 0.0017730547115206718, + "learning_rate": 0.20990717999507552, + "loss": 0.0827, + "num_input_tokens_seen": 13361920, + "step": 14770 + }, + { + "epoch": 3.899564471426686, + "grad_norm": 0.0018967031501233578, + "learning_rate": 0.20985317228564276, + "loss": 0.0931, + "num_input_tokens_seen": 13366656, + "step": 14775 + }, + { + "epoch": 3.900884254982183, + "grad_norm": 0.0013928621774539351, + "learning_rate": 0.20979915534609872, + "loss": 0.0899, + "num_input_tokens_seen": 13371264, + "step": 14780 + }, + { + "epoch": 3.90220403853768, + "grad_norm": 0.0017158612608909607, + "learning_rate": 0.20974512918477342, + "loss": 0.0843, + "num_input_tokens_seen": 13375616, + "step": 14785 + }, + { + "epoch": 3.9035238220931765, + "grad_norm": 0.0017767802346497774, + "learning_rate": 0.2096910938099984, + "loss": 0.0553, + "num_input_tokens_seen": 13380416, + "step": 14790 + }, + { + "epoch": 3.9048436056486735, + "grad_norm": 0.003115543397143483, + "learning_rate": 0.2096370492301066, + "loss": 0.0768, + "num_input_tokens_seen": 13384896, + "step": 14795 + }, + { + "epoch": 3.9061633892041705, + "grad_norm": 0.0015413847286254168, + "learning_rate": 0.2095829954534323, + "loss": 0.0452, + "num_input_tokens_seen": 13389408, + "step": 14800 + }, + { + "epoch": 3.9061633892041705, + "eval_loss": 0.09680826961994171, + "eval_runtime": 75.9488, + "eval_samples_per_second": 88.678, + "eval_steps_per_second": 22.173, + "num_input_tokens_seen": 13389408, + "step": 14800 + }, + { + "epoch": 3.9074831727596675, + "grad_norm": 0.0017867572605609894, + "learning_rate": 0.2095289324883114, + "loss": 0.1134, + "num_input_tokens_seen": 13393824, + "step": 14805 + }, + { + "epoch": 3.9088029563151645, + "grad_norm": 0.0013162739342078567, + "learning_rate": 0.20947486034308097, + "loss": 0.0657, + "num_input_tokens_seen": 13398048, + "step": 14810 + }, + { + "epoch": 3.910122739870661, + "grad_norm": 0.0014142530271783471, + "learning_rate": 0.2094207790260797, + "loss": 0.0855, + "num_input_tokens_seen": 13402304, + "step": 14815 + }, + { + "epoch": 3.911442523426158, + "grad_norm": 0.002527160570025444, + "learning_rate": 0.20936668854564758, + "loss": 0.0882, + "num_input_tokens_seen": 13406720, + "step": 14820 + }, + { + "epoch": 3.912762306981655, + "grad_norm": 0.0011959756957367063, + "learning_rate": 0.20931258891012602, + "loss": 0.0637, + "num_input_tokens_seen": 13411072, + "step": 14825 + }, + { + "epoch": 3.914082090537152, + "grad_norm": 0.002348797395825386, + "learning_rate": 0.20925848012785792, + "loss": 0.1108, + "num_input_tokens_seen": 13415680, + "step": 14830 + }, + { + "epoch": 3.915401874092649, + "grad_norm": 0.000776077329646796, + "learning_rate": 0.20920436220718747, + "loss": 0.071, + "num_input_tokens_seen": 13420064, + "step": 14835 + }, + { + "epoch": 3.9167216576481456, + "grad_norm": 0.002378988079726696, + "learning_rate": 0.20915023515646033, + "loss": 0.1171, + "num_input_tokens_seen": 13424800, + "step": 14840 + }, + { + "epoch": 3.9180414412036426, + "grad_norm": 0.0018637346802279353, + "learning_rate": 0.20909609898402368, + "loss": 0.1015, + "num_input_tokens_seen": 13429504, + "step": 14845 + }, + { + "epoch": 3.9193612247591396, + "grad_norm": 0.000713646353688091, + "learning_rate": 0.2090419536982258, + "loss": 0.084, + "num_input_tokens_seen": 13433600, + "step": 14850 + }, + { + "epoch": 3.920681008314636, + "grad_norm": 0.0008844065596349537, + "learning_rate": 0.2089877993074168, + "loss": 0.093, + "num_input_tokens_seen": 13437984, + "step": 14855 + }, + { + "epoch": 3.922000791870133, + "grad_norm": 0.0016905999509617686, + "learning_rate": 0.20893363581994784, + "loss": 0.0983, + "num_input_tokens_seen": 13442336, + "step": 14860 + }, + { + "epoch": 3.92332057542563, + "grad_norm": 0.003121489891782403, + "learning_rate": 0.2088794632441716, + "loss": 0.1008, + "num_input_tokens_seen": 13447072, + "step": 14865 + }, + { + "epoch": 3.924640358981127, + "grad_norm": 0.0018056321423500776, + "learning_rate": 0.20882528158844219, + "loss": 0.0715, + "num_input_tokens_seen": 13451808, + "step": 14870 + }, + { + "epoch": 3.925960142536624, + "grad_norm": 0.0008888181764632463, + "learning_rate": 0.20877109086111514, + "loss": 0.0676, + "num_input_tokens_seen": 13456416, + "step": 14875 + }, + { + "epoch": 3.9272799260921207, + "grad_norm": 0.0011211626697331667, + "learning_rate": 0.2087168910705473, + "loss": 0.0668, + "num_input_tokens_seen": 13460800, + "step": 14880 + }, + { + "epoch": 3.9285997096476177, + "grad_norm": 0.002297539496794343, + "learning_rate": 0.208662682225097, + "loss": 0.0875, + "num_input_tokens_seen": 13465248, + "step": 14885 + }, + { + "epoch": 3.9299194932031147, + "grad_norm": 0.0029707232024520636, + "learning_rate": 0.2086084643331239, + "loss": 0.0887, + "num_input_tokens_seen": 13469888, + "step": 14890 + }, + { + "epoch": 3.9312392767586117, + "grad_norm": 0.002377696568146348, + "learning_rate": 0.20855423740298906, + "loss": 0.097, + "num_input_tokens_seen": 13474368, + "step": 14895 + }, + { + "epoch": 3.9325590603141087, + "grad_norm": 0.002475721063092351, + "learning_rate": 0.208500001443055, + "loss": 0.1174, + "num_input_tokens_seen": 13478688, + "step": 14900 + }, + { + "epoch": 3.9338788438696053, + "grad_norm": 0.0019597713835537434, + "learning_rate": 0.20844575646168553, + "loss": 0.0757, + "num_input_tokens_seen": 13483104, + "step": 14905 + }, + { + "epoch": 3.9351986274251023, + "grad_norm": 0.0026029120199382305, + "learning_rate": 0.20839150246724594, + "loss": 0.1015, + "num_input_tokens_seen": 13487648, + "step": 14910 + }, + { + "epoch": 3.9365184109805993, + "grad_norm": 0.0016998512437567115, + "learning_rate": 0.20833723946810287, + "loss": 0.0758, + "num_input_tokens_seen": 13492416, + "step": 14915 + }, + { + "epoch": 3.937838194536096, + "grad_norm": 0.0015776589279994369, + "learning_rate": 0.20828296747262437, + "loss": 0.0849, + "num_input_tokens_seen": 13496640, + "step": 14920 + }, + { + "epoch": 3.939157978091593, + "grad_norm": 0.0013860843610018492, + "learning_rate": 0.20822868648917986, + "loss": 0.107, + "num_input_tokens_seen": 13501600, + "step": 14925 + }, + { + "epoch": 3.94047776164709, + "grad_norm": 0.0012015992542728782, + "learning_rate": 0.20817439652614017, + "loss": 0.1068, + "num_input_tokens_seen": 13506240, + "step": 14930 + }, + { + "epoch": 3.941797545202587, + "grad_norm": 0.0013378307921811938, + "learning_rate": 0.20812009759187744, + "loss": 0.0761, + "num_input_tokens_seen": 13510624, + "step": 14935 + }, + { + "epoch": 3.943117328758084, + "grad_norm": 0.0016448100795969367, + "learning_rate": 0.2080657896947653, + "loss": 0.1142, + "num_input_tokens_seen": 13515264, + "step": 14940 + }, + { + "epoch": 3.9444371123135804, + "grad_norm": 0.0020742686465382576, + "learning_rate": 0.2080114728431787, + "loss": 0.0608, + "num_input_tokens_seen": 13519552, + "step": 14945 + }, + { + "epoch": 3.9457568958690774, + "grad_norm": 0.001293789828196168, + "learning_rate": 0.20795714704549392, + "loss": 0.0616, + "num_input_tokens_seen": 13524032, + "step": 14950 + }, + { + "epoch": 3.9470766794245744, + "grad_norm": 0.0010956835467368364, + "learning_rate": 0.20790281231008875, + "loss": 0.0702, + "num_input_tokens_seen": 13528448, + "step": 14955 + }, + { + "epoch": 3.9483964629800714, + "grad_norm": 0.0009174896986223757, + "learning_rate": 0.20784846864534226, + "loss": 0.0745, + "num_input_tokens_seen": 13533056, + "step": 14960 + }, + { + "epoch": 3.9497162465355684, + "grad_norm": 0.0028892827685922384, + "learning_rate": 0.20779411605963496, + "loss": 0.0512, + "num_input_tokens_seen": 13537440, + "step": 14965 + }, + { + "epoch": 3.951036030091065, + "grad_norm": 0.0027253685984760523, + "learning_rate": 0.2077397545613487, + "loss": 0.1088, + "num_input_tokens_seen": 13541888, + "step": 14970 + }, + { + "epoch": 3.952355813646562, + "grad_norm": 0.0027172998525202274, + "learning_rate": 0.20768538415886661, + "loss": 0.084, + "num_input_tokens_seen": 13546528, + "step": 14975 + }, + { + "epoch": 3.953675597202059, + "grad_norm": 0.0009856659453362226, + "learning_rate": 0.20763100486057343, + "loss": 0.0648, + "num_input_tokens_seen": 13551072, + "step": 14980 + }, + { + "epoch": 3.9549953807575555, + "grad_norm": 0.0029364051297307014, + "learning_rate": 0.20757661667485502, + "loss": 0.0816, + "num_input_tokens_seen": 13555360, + "step": 14985 + }, + { + "epoch": 3.9563151643130525, + "grad_norm": 0.0006801997078582644, + "learning_rate": 0.2075222196100988, + "loss": 0.0657, + "num_input_tokens_seen": 13560032, + "step": 14990 + }, + { + "epoch": 3.9576349478685495, + "grad_norm": 0.0025477949529886246, + "learning_rate": 0.20746781367469344, + "loss": 0.116, + "num_input_tokens_seen": 13564608, + "step": 14995 + }, + { + "epoch": 3.9589547314240465, + "grad_norm": 0.002247289754450321, + "learning_rate": 0.207413398877029, + "loss": 0.0793, + "num_input_tokens_seen": 13569120, + "step": 15000 + }, + { + "epoch": 3.9589547314240465, + "eval_loss": 0.09848299622535706, + "eval_runtime": 75.9165, + "eval_samples_per_second": 88.716, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 13569120, + "step": 15000 + }, + { + "epoch": 3.9602745149795435, + "grad_norm": 0.0035421072971075773, + "learning_rate": 0.20735897522549698, + "loss": 0.1063, + "num_input_tokens_seen": 13573632, + "step": 15005 + }, + { + "epoch": 3.96159429853504, + "grad_norm": 0.00175861909519881, + "learning_rate": 0.2073045427284902, + "loss": 0.0636, + "num_input_tokens_seen": 13578016, + "step": 15010 + }, + { + "epoch": 3.962914082090537, + "grad_norm": 0.0012117784935981035, + "learning_rate": 0.2072501013944027, + "loss": 0.0579, + "num_input_tokens_seen": 13582240, + "step": 15015 + }, + { + "epoch": 3.964233865646034, + "grad_norm": 0.002757769776508212, + "learning_rate": 0.20719565123163017, + "loss": 0.0886, + "num_input_tokens_seen": 13586720, + "step": 15020 + }, + { + "epoch": 3.965553649201531, + "grad_norm": 0.0014189336216077209, + "learning_rate": 0.20714119224856944, + "loss": 0.0896, + "num_input_tokens_seen": 13591488, + "step": 15025 + }, + { + "epoch": 3.966873432757028, + "grad_norm": 0.0014196941629052162, + "learning_rate": 0.2070867244536188, + "loss": 0.0692, + "num_input_tokens_seen": 13596032, + "step": 15030 + }, + { + "epoch": 3.9681932163125246, + "grad_norm": 0.0019256898667663336, + "learning_rate": 0.20703224785517785, + "loss": 0.0668, + "num_input_tokens_seen": 13600512, + "step": 15035 + }, + { + "epoch": 3.9695129998680216, + "grad_norm": 0.0028970749117434025, + "learning_rate": 0.20697776246164754, + "loss": 0.0869, + "num_input_tokens_seen": 13605056, + "step": 15040 + }, + { + "epoch": 3.9708327834235186, + "grad_norm": 0.0025961108040064573, + "learning_rate": 0.2069232682814303, + "loss": 0.0906, + "num_input_tokens_seen": 13609408, + "step": 15045 + }, + { + "epoch": 3.972152566979015, + "grad_norm": 0.0010657196398824453, + "learning_rate": 0.20686876532292972, + "loss": 0.083, + "num_input_tokens_seen": 13613600, + "step": 15050 + }, + { + "epoch": 3.973472350534512, + "grad_norm": 0.002419683849439025, + "learning_rate": 0.20681425359455083, + "loss": 0.1006, + "num_input_tokens_seen": 13618304, + "step": 15055 + }, + { + "epoch": 3.974792134090009, + "grad_norm": 0.0022749665658921003, + "learning_rate": 0.20675973310470008, + "loss": 0.0797, + "num_input_tokens_seen": 13622752, + "step": 15060 + }, + { + "epoch": 3.976111917645506, + "grad_norm": 0.0009196614846587181, + "learning_rate": 0.2067052038617852, + "loss": 0.1188, + "num_input_tokens_seen": 13627552, + "step": 15065 + }, + { + "epoch": 3.977431701201003, + "grad_norm": 0.0032640742138028145, + "learning_rate": 0.2066506658742153, + "loss": 0.1065, + "num_input_tokens_seen": 13632224, + "step": 15070 + }, + { + "epoch": 3.9787514847564998, + "grad_norm": 0.002478277077898383, + "learning_rate": 0.20659611915040077, + "loss": 0.0888, + "num_input_tokens_seen": 13636672, + "step": 15075 + }, + { + "epoch": 3.9800712683119968, + "grad_norm": 0.0023461459204554558, + "learning_rate": 0.20654156369875348, + "loss": 0.0985, + "num_input_tokens_seen": 13641088, + "step": 15080 + }, + { + "epoch": 3.9813910518674938, + "grad_norm": 0.0009933849796652794, + "learning_rate": 0.20648699952768648, + "loss": 0.0786, + "num_input_tokens_seen": 13645856, + "step": 15085 + }, + { + "epoch": 3.9827108354229908, + "grad_norm": 0.003027458908036351, + "learning_rate": 0.20643242664561437, + "loss": 0.0973, + "num_input_tokens_seen": 13650432, + "step": 15090 + }, + { + "epoch": 3.9840306189784878, + "grad_norm": 0.0014646906638517976, + "learning_rate": 0.20637784506095277, + "loss": 0.0764, + "num_input_tokens_seen": 13654720, + "step": 15095 + }, + { + "epoch": 3.9853504025339843, + "grad_norm": 0.0014418527716770768, + "learning_rate": 0.20632325478211908, + "loss": 0.09, + "num_input_tokens_seen": 13659200, + "step": 15100 + }, + { + "epoch": 3.9866701860894813, + "grad_norm": 0.0012254492612555623, + "learning_rate": 0.20626865581753165, + "loss": 0.0743, + "num_input_tokens_seen": 13663648, + "step": 15105 + }, + { + "epoch": 3.9879899696449783, + "grad_norm": 0.002118075732141733, + "learning_rate": 0.2062140481756104, + "loss": 0.0833, + "num_input_tokens_seen": 13668192, + "step": 15110 + }, + { + "epoch": 3.989309753200475, + "grad_norm": 0.0018331055762246251, + "learning_rate": 0.20615943186477648, + "loss": 0.0848, + "num_input_tokens_seen": 13673056, + "step": 15115 + }, + { + "epoch": 3.990629536755972, + "grad_norm": 0.0031550719868391752, + "learning_rate": 0.20610480689345242, + "loss": 0.1169, + "num_input_tokens_seen": 13677152, + "step": 15120 + }, + { + "epoch": 3.991949320311469, + "grad_norm": 0.0018210795242339373, + "learning_rate": 0.2060501732700621, + "loss": 0.0899, + "num_input_tokens_seen": 13681664, + "step": 15125 + }, + { + "epoch": 3.993269103866966, + "grad_norm": 0.0020871926099061966, + "learning_rate": 0.20599553100303067, + "loss": 0.0914, + "num_input_tokens_seen": 13686144, + "step": 15130 + }, + { + "epoch": 3.994588887422463, + "grad_norm": 0.0014638841385021806, + "learning_rate": 0.20594088010078465, + "loss": 0.0968, + "num_input_tokens_seen": 13691264, + "step": 15135 + }, + { + "epoch": 3.9959086709779594, + "grad_norm": 0.0012678117491304874, + "learning_rate": 0.20588622057175196, + "loss": 0.1131, + "num_input_tokens_seen": 13695744, + "step": 15140 + }, + { + "epoch": 3.9972284545334564, + "grad_norm": 0.0017237425781786442, + "learning_rate": 0.20583155242436177, + "loss": 0.1359, + "num_input_tokens_seen": 13700160, + "step": 15145 + }, + { + "epoch": 3.9985482380889534, + "grad_norm": 0.0014819359639659524, + "learning_rate": 0.20577687566704453, + "loss": 0.08, + "num_input_tokens_seen": 13704768, + "step": 15150 + }, + { + "epoch": 3.9998680216444504, + "grad_norm": 0.002848415169864893, + "learning_rate": 0.20572219030823213, + "loss": 0.0665, + "num_input_tokens_seen": 13709216, + "step": 15155 + }, + { + "epoch": 4.001055826844397, + "grad_norm": 0.0007997734937816858, + "learning_rate": 0.20566749635635775, + "loss": 0.0492, + "num_input_tokens_seen": 13713232, + "step": 15160 + }, + { + "epoch": 4.002375610399894, + "grad_norm": 0.0011833346216008067, + "learning_rate": 0.20561279381985587, + "loss": 0.0839, + "num_input_tokens_seen": 13717776, + "step": 15165 + }, + { + "epoch": 4.003695393955391, + "grad_norm": 0.002184211974963546, + "learning_rate": 0.2055580827071623, + "loss": 0.0922, + "num_input_tokens_seen": 13722096, + "step": 15170 + }, + { + "epoch": 4.005015177510888, + "grad_norm": 0.0020191704388707876, + "learning_rate": 0.20550336302671418, + "loss": 0.0737, + "num_input_tokens_seen": 13726672, + "step": 15175 + }, + { + "epoch": 4.006334961066385, + "grad_norm": 0.00166821270249784, + "learning_rate": 0.20544863478695, + "loss": 0.0817, + "num_input_tokens_seen": 13731152, + "step": 15180 + }, + { + "epoch": 4.007654744621882, + "grad_norm": 0.0019028829410672188, + "learning_rate": 0.20539389799630953, + "loss": 0.0788, + "num_input_tokens_seen": 13735632, + "step": 15185 + }, + { + "epoch": 4.0089745281773785, + "grad_norm": 0.0010678151156753302, + "learning_rate": 0.20533915266323388, + "loss": 0.0936, + "num_input_tokens_seen": 13740400, + "step": 15190 + }, + { + "epoch": 4.010294311732876, + "grad_norm": 0.0021169581450521946, + "learning_rate": 0.20528439879616542, + "loss": 0.0786, + "num_input_tokens_seen": 13744880, + "step": 15195 + }, + { + "epoch": 4.0116140952883725, + "grad_norm": 0.0020183760207146406, + "learning_rate": 0.20522963640354794, + "loss": 0.0547, + "num_input_tokens_seen": 13749232, + "step": 15200 + }, + { + "epoch": 4.0116140952883725, + "eval_loss": 0.09538926929235458, + "eval_runtime": 76.0531, + "eval_samples_per_second": 88.556, + "eval_steps_per_second": 22.142, + "num_input_tokens_seen": 13749232, + "step": 15200 + }, + { + "epoch": 4.01293387884387, + "grad_norm": 0.0020857718773186207, + "learning_rate": 0.20517486549382644, + "loss": 0.11, + "num_input_tokens_seen": 13753776, + "step": 15205 + }, + { + "epoch": 4.0142536623993665, + "grad_norm": 0.0005381673690862954, + "learning_rate": 0.20512008607544735, + "loss": 0.1068, + "num_input_tokens_seen": 13758000, + "step": 15210 + }, + { + "epoch": 4.015573445954863, + "grad_norm": 0.0018678188789635897, + "learning_rate": 0.20506529815685826, + "loss": 0.0701, + "num_input_tokens_seen": 13762672, + "step": 15215 + }, + { + "epoch": 4.0168932295103605, + "grad_norm": 0.0024583758786320686, + "learning_rate": 0.2050105017465082, + "loss": 0.0809, + "num_input_tokens_seen": 13767312, + "step": 15220 + }, + { + "epoch": 4.018213013065857, + "grad_norm": 0.001854711095802486, + "learning_rate": 0.20495569685284754, + "loss": 0.1067, + "num_input_tokens_seen": 13771920, + "step": 15225 + }, + { + "epoch": 4.019532796621354, + "grad_norm": 0.0019312886288389564, + "learning_rate": 0.20490088348432778, + "loss": 0.0565, + "num_input_tokens_seen": 13776816, + "step": 15230 + }, + { + "epoch": 4.020852580176851, + "grad_norm": 0.0022296265233308077, + "learning_rate": 0.2048460616494018, + "loss": 0.0553, + "num_input_tokens_seen": 13781424, + "step": 15235 + }, + { + "epoch": 4.022172363732348, + "grad_norm": 0.002796100452542305, + "learning_rate": 0.2047912313565239, + "loss": 0.086, + "num_input_tokens_seen": 13785840, + "step": 15240 + }, + { + "epoch": 4.023492147287845, + "grad_norm": 0.003387344302609563, + "learning_rate": 0.20473639261414958, + "loss": 0.0875, + "num_input_tokens_seen": 13790480, + "step": 15245 + }, + { + "epoch": 4.024811930843342, + "grad_norm": 0.000903816195204854, + "learning_rate": 0.2046815454307357, + "loss": 0.0795, + "num_input_tokens_seen": 13795056, + "step": 15250 + }, + { + "epoch": 4.026131714398838, + "grad_norm": 0.0015000718412920833, + "learning_rate": 0.20462668981474028, + "loss": 0.0447, + "num_input_tokens_seen": 13799696, + "step": 15255 + }, + { + "epoch": 4.027451497954336, + "grad_norm": 0.001208805595524609, + "learning_rate": 0.20457182577462288, + "loss": 0.1295, + "num_input_tokens_seen": 13804336, + "step": 15260 + }, + { + "epoch": 4.028771281509832, + "grad_norm": 0.0011711795814335346, + "learning_rate": 0.2045169533188441, + "loss": 0.0677, + "num_input_tokens_seen": 13808656, + "step": 15265 + }, + { + "epoch": 4.03009106506533, + "grad_norm": 0.0014954106882214546, + "learning_rate": 0.20446207245586603, + "loss": 0.0965, + "num_input_tokens_seen": 13813104, + "step": 15270 + }, + { + "epoch": 4.031410848620826, + "grad_norm": 0.0012550886021927, + "learning_rate": 0.20440718319415196, + "loss": 0.0837, + "num_input_tokens_seen": 13817648, + "step": 15275 + }, + { + "epoch": 4.032730632176323, + "grad_norm": 0.0008482203120365739, + "learning_rate": 0.20435228554216653, + "loss": 0.0851, + "num_input_tokens_seen": 13822288, + "step": 15280 + }, + { + "epoch": 4.03405041573182, + "grad_norm": 0.0012399585684761405, + "learning_rate": 0.20429737950837565, + "loss": 0.0974, + "num_input_tokens_seen": 13826608, + "step": 15285 + }, + { + "epoch": 4.035370199287317, + "grad_norm": 0.001827690633945167, + "learning_rate": 0.20424246510124647, + "loss": 0.0705, + "num_input_tokens_seen": 13831120, + "step": 15290 + }, + { + "epoch": 4.036689982842814, + "grad_norm": 0.003211505478248, + "learning_rate": 0.20418754232924755, + "loss": 0.0533, + "num_input_tokens_seen": 13835536, + "step": 15295 + }, + { + "epoch": 4.038009766398311, + "grad_norm": 0.0020171564538031816, + "learning_rate": 0.20413261120084863, + "loss": 0.1184, + "num_input_tokens_seen": 13839856, + "step": 15300 + }, + { + "epoch": 4.039329549953807, + "grad_norm": 0.002562521491199732, + "learning_rate": 0.2040776717245208, + "loss": 0.0701, + "num_input_tokens_seen": 13844048, + "step": 15305 + }, + { + "epoch": 4.040649333509305, + "grad_norm": 0.0003951053658965975, + "learning_rate": 0.2040227239087364, + "loss": 0.1218, + "num_input_tokens_seen": 13848528, + "step": 15310 + }, + { + "epoch": 4.041969117064801, + "grad_norm": 0.00161907565779984, + "learning_rate": 0.20396776776196904, + "loss": 0.0924, + "num_input_tokens_seen": 13852944, + "step": 15315 + }, + { + "epoch": 4.043288900620298, + "grad_norm": 0.0011318265460431576, + "learning_rate": 0.20391280329269373, + "loss": 0.0584, + "num_input_tokens_seen": 13857232, + "step": 15320 + }, + { + "epoch": 4.044608684175795, + "grad_norm": 0.0013068412663415074, + "learning_rate": 0.20385783050938663, + "loss": 0.108, + "num_input_tokens_seen": 13861456, + "step": 15325 + }, + { + "epoch": 4.045928467731292, + "grad_norm": 0.0018132046097889543, + "learning_rate": 0.20380284942052526, + "loss": 0.0768, + "num_input_tokens_seen": 13866256, + "step": 15330 + }, + { + "epoch": 4.047248251286789, + "grad_norm": 0.0014795905444771051, + "learning_rate": 0.2037478600345884, + "loss": 0.0849, + "num_input_tokens_seen": 13870768, + "step": 15335 + }, + { + "epoch": 4.048568034842286, + "grad_norm": 0.0018336643697693944, + "learning_rate": 0.20369286236005604, + "loss": 0.0506, + "num_input_tokens_seen": 13875216, + "step": 15340 + }, + { + "epoch": 4.0498878183977824, + "grad_norm": 0.00193145708180964, + "learning_rate": 0.20363785640540957, + "loss": 0.1171, + "num_input_tokens_seen": 13879504, + "step": 15345 + }, + { + "epoch": 4.05120760195328, + "grad_norm": 0.0027316079940646887, + "learning_rate": 0.2035828421791316, + "loss": 0.0913, + "num_input_tokens_seen": 13884080, + "step": 15350 + }, + { + "epoch": 4.0525273855087764, + "grad_norm": 0.0024585153441876173, + "learning_rate": 0.20352781968970599, + "loss": 0.0581, + "num_input_tokens_seen": 13888752, + "step": 15355 + }, + { + "epoch": 4.053847169064274, + "grad_norm": 0.002059401711449027, + "learning_rate": 0.2034727889456179, + "loss": 0.1015, + "num_input_tokens_seen": 13893264, + "step": 15360 + }, + { + "epoch": 4.0551669526197704, + "grad_norm": 0.0019446308724582195, + "learning_rate": 0.2034177499553538, + "loss": 0.0515, + "num_input_tokens_seen": 13897648, + "step": 15365 + }, + { + "epoch": 4.056486736175267, + "grad_norm": 0.0011331748683005571, + "learning_rate": 0.2033627027274014, + "loss": 0.0532, + "num_input_tokens_seen": 13902288, + "step": 15370 + }, + { + "epoch": 4.0578065197307644, + "grad_norm": 0.0032707287464290857, + "learning_rate": 0.20330764727024955, + "loss": 0.1314, + "num_input_tokens_seen": 13906928, + "step": 15375 + }, + { + "epoch": 4.059126303286261, + "grad_norm": 0.0017393635353073478, + "learning_rate": 0.20325258359238868, + "loss": 0.0673, + "num_input_tokens_seen": 13911376, + "step": 15380 + }, + { + "epoch": 4.060446086841758, + "grad_norm": 0.0019783691968768835, + "learning_rate": 0.20319751170231018, + "loss": 0.0749, + "num_input_tokens_seen": 13915920, + "step": 15385 + }, + { + "epoch": 4.061765870397255, + "grad_norm": 0.0014591857325285673, + "learning_rate": 0.2031424316085068, + "loss": 0.0872, + "num_input_tokens_seen": 13920368, + "step": 15390 + }, + { + "epoch": 4.063085653952752, + "grad_norm": 0.0007502383668906987, + "learning_rate": 0.20308734331947265, + "loss": 0.0535, + "num_input_tokens_seen": 13924944, + "step": 15395 + }, + { + "epoch": 4.064405437508249, + "grad_norm": 0.0017126294551417232, + "learning_rate": 0.20303224684370305, + "loss": 0.0511, + "num_input_tokens_seen": 13929232, + "step": 15400 + }, + { + "epoch": 4.064405437508249, + "eval_loss": 0.10169105976819992, + "eval_runtime": 75.7705, + "eval_samples_per_second": 88.887, + "eval_steps_per_second": 22.225, + "num_input_tokens_seen": 13929232, + "step": 15400 + }, + { + "epoch": 4.065725221063746, + "grad_norm": 0.002769033657386899, + "learning_rate": 0.20297714218969456, + "loss": 0.0776, + "num_input_tokens_seen": 13933680, + "step": 15405 + }, + { + "epoch": 4.067045004619242, + "grad_norm": 0.0011308604152873158, + "learning_rate": 0.20292202936594497, + "loss": 0.043, + "num_input_tokens_seen": 13938160, + "step": 15410 + }, + { + "epoch": 4.06836478817474, + "grad_norm": 0.002514026127755642, + "learning_rate": 0.2028669083809534, + "loss": 0.0743, + "num_input_tokens_seen": 13942512, + "step": 15415 + }, + { + "epoch": 4.069684571730236, + "grad_norm": 0.0013220981927588582, + "learning_rate": 0.20281177924322016, + "loss": 0.0781, + "num_input_tokens_seen": 13947152, + "step": 15420 + }, + { + "epoch": 4.071004355285734, + "grad_norm": 0.0026810963172465563, + "learning_rate": 0.2027566419612469, + "loss": 0.0729, + "num_input_tokens_seen": 13951696, + "step": 15425 + }, + { + "epoch": 4.07232413884123, + "grad_norm": 0.0012277431087568402, + "learning_rate": 0.20270149654353647, + "loss": 0.0731, + "num_input_tokens_seen": 13956496, + "step": 15430 + }, + { + "epoch": 4.073643922396727, + "grad_norm": 0.001998962601646781, + "learning_rate": 0.202646342998593, + "loss": 0.0715, + "num_input_tokens_seen": 13960624, + "step": 15435 + }, + { + "epoch": 4.074963705952224, + "grad_norm": 0.0022690880578011274, + "learning_rate": 0.20259118133492185, + "loss": 0.0627, + "num_input_tokens_seen": 13965168, + "step": 15440 + }, + { + "epoch": 4.076283489507721, + "grad_norm": 0.0030577746219933033, + "learning_rate": 0.20253601156102966, + "loss": 0.1196, + "num_input_tokens_seen": 13970032, + "step": 15445 + }, + { + "epoch": 4.077603273063217, + "grad_norm": 0.0030840574763715267, + "learning_rate": 0.20248083368542422, + "loss": 0.041, + "num_input_tokens_seen": 13974416, + "step": 15450 + }, + { + "epoch": 4.078923056618715, + "grad_norm": 0.00199921359308064, + "learning_rate": 0.2024256477166147, + "loss": 0.0862, + "num_input_tokens_seen": 13978992, + "step": 15455 + }, + { + "epoch": 4.080242840174211, + "grad_norm": 0.0014009441947564483, + "learning_rate": 0.2023704536631115, + "loss": 0.0976, + "num_input_tokens_seen": 13983824, + "step": 15460 + }, + { + "epoch": 4.081562623729709, + "grad_norm": 0.0021826920565217733, + "learning_rate": 0.20231525153342625, + "loss": 0.0572, + "num_input_tokens_seen": 13988656, + "step": 15465 + }, + { + "epoch": 4.082882407285205, + "grad_norm": 0.0018664810340851545, + "learning_rate": 0.20226004133607173, + "loss": 0.063, + "num_input_tokens_seen": 13992816, + "step": 15470 + }, + { + "epoch": 4.084202190840702, + "grad_norm": 0.0012311659520491958, + "learning_rate": 0.20220482307956214, + "loss": 0.0578, + "num_input_tokens_seen": 13997264, + "step": 15475 + }, + { + "epoch": 4.085521974396199, + "grad_norm": 0.0021250429563224316, + "learning_rate": 0.20214959677241276, + "loss": 0.054, + "num_input_tokens_seen": 14001680, + "step": 15480 + }, + { + "epoch": 4.086841757951696, + "grad_norm": 0.0003497425059322268, + "learning_rate": 0.20209436242314022, + "loss": 0.0886, + "num_input_tokens_seen": 14006160, + "step": 15485 + }, + { + "epoch": 4.088161541507193, + "grad_norm": 0.0004490225692279637, + "learning_rate": 0.2020391200402623, + "loss": 0.1067, + "num_input_tokens_seen": 14010992, + "step": 15490 + }, + { + "epoch": 4.08948132506269, + "grad_norm": 0.0013145795091986656, + "learning_rate": 0.2019838696322981, + "loss": 0.0619, + "num_input_tokens_seen": 14015600, + "step": 15495 + }, + { + "epoch": 4.090801108618186, + "grad_norm": 0.0008837248315103352, + "learning_rate": 0.20192861120776798, + "loss": 0.0695, + "num_input_tokens_seen": 14020240, + "step": 15500 + }, + { + "epoch": 4.092120892173684, + "grad_norm": 0.005429161712527275, + "learning_rate": 0.20187334477519345, + "loss": 0.0718, + "num_input_tokens_seen": 14024880, + "step": 15505 + }, + { + "epoch": 4.09344067572918, + "grad_norm": 0.0023596840910613537, + "learning_rate": 0.20181807034309726, + "loss": 0.0566, + "num_input_tokens_seen": 14029264, + "step": 15510 + }, + { + "epoch": 4.094760459284677, + "grad_norm": 0.0012512713437899947, + "learning_rate": 0.2017627879200034, + "loss": 0.0367, + "num_input_tokens_seen": 14033680, + "step": 15515 + }, + { + "epoch": 4.096080242840174, + "grad_norm": 0.001485916436649859, + "learning_rate": 0.2017074975144372, + "loss": 0.0363, + "num_input_tokens_seen": 14038096, + "step": 15520 + }, + { + "epoch": 4.097400026395671, + "grad_norm": 0.004231830593198538, + "learning_rate": 0.20165219913492508, + "loss": 0.1144, + "num_input_tokens_seen": 14042672, + "step": 15525 + }, + { + "epoch": 4.098719809951168, + "grad_norm": 0.0015014742966741323, + "learning_rate": 0.20159689278999468, + "loss": 0.0449, + "num_input_tokens_seen": 14046960, + "step": 15530 + }, + { + "epoch": 4.100039593506665, + "grad_norm": 0.0029295007698237896, + "learning_rate": 0.20154157848817508, + "loss": 0.0565, + "num_input_tokens_seen": 14051440, + "step": 15535 + }, + { + "epoch": 4.1013593770621615, + "grad_norm": 0.0013588241999968886, + "learning_rate": 0.20148625623799632, + "loss": 0.0588, + "num_input_tokens_seen": 14055984, + "step": 15540 + }, + { + "epoch": 4.102679160617659, + "grad_norm": 0.0025837435387074947, + "learning_rate": 0.20143092604798984, + "loss": 0.0509, + "num_input_tokens_seen": 14060592, + "step": 15545 + }, + { + "epoch": 4.1039989441731555, + "grad_norm": 0.004300371743738651, + "learning_rate": 0.2013755879266883, + "loss": 0.0816, + "num_input_tokens_seen": 14065328, + "step": 15550 + }, + { + "epoch": 4.105318727728653, + "grad_norm": 0.001269783009774983, + "learning_rate": 0.20132024188262543, + "loss": 0.1011, + "num_input_tokens_seen": 14070160, + "step": 15555 + }, + { + "epoch": 4.1066385112841495, + "grad_norm": 0.0022221265826374292, + "learning_rate": 0.2012648879243363, + "loss": 0.0337, + "num_input_tokens_seen": 14074256, + "step": 15560 + }, + { + "epoch": 4.107958294839646, + "grad_norm": 0.002577871084213257, + "learning_rate": 0.20120952606035725, + "loss": 0.0599, + "num_input_tokens_seen": 14079024, + "step": 15565 + }, + { + "epoch": 4.1092780783951435, + "grad_norm": 0.002778481226414442, + "learning_rate": 0.20115415629922576, + "loss": 0.0485, + "num_input_tokens_seen": 14083632, + "step": 15570 + }, + { + "epoch": 4.11059786195064, + "grad_norm": 0.002542924601584673, + "learning_rate": 0.20109877864948048, + "loss": 0.1191, + "num_input_tokens_seen": 14088144, + "step": 15575 + }, + { + "epoch": 4.111917645506137, + "grad_norm": 0.002188686979934573, + "learning_rate": 0.20104339311966138, + "loss": 0.0932, + "num_input_tokens_seen": 14092496, + "step": 15580 + }, + { + "epoch": 4.113237429061634, + "grad_norm": 0.003812576411291957, + "learning_rate": 0.2009879997183097, + "loss": 0.1165, + "num_input_tokens_seen": 14097232, + "step": 15585 + }, + { + "epoch": 4.114557212617131, + "grad_norm": 0.0021653652656823397, + "learning_rate": 0.20093259845396763, + "loss": 0.1323, + "num_input_tokens_seen": 14102032, + "step": 15590 + }, + { + "epoch": 4.115876996172628, + "grad_norm": 0.002482086420059204, + "learning_rate": 0.20087718933517884, + "loss": 0.0622, + "num_input_tokens_seen": 14106512, + "step": 15595 + }, + { + "epoch": 4.117196779728125, + "grad_norm": 0.001888201106339693, + "learning_rate": 0.20082177237048807, + "loss": 0.0941, + "num_input_tokens_seen": 14111056, + "step": 15600 + }, + { + "epoch": 4.117196779728125, + "eval_loss": 0.0976809486746788, + "eval_runtime": 75.9879, + "eval_samples_per_second": 88.633, + "eval_steps_per_second": 22.161, + "num_input_tokens_seen": 14111056, + "step": 15600 + }, + { + "epoch": 4.118516563283621, + "grad_norm": 0.0021814594510942698, + "learning_rate": 0.20076634756844133, + "loss": 0.0704, + "num_input_tokens_seen": 14115408, + "step": 15605 + }, + { + "epoch": 4.119836346839119, + "grad_norm": 0.003323210868984461, + "learning_rate": 0.20071091493758586, + "loss": 0.0759, + "num_input_tokens_seen": 14119728, + "step": 15610 + }, + { + "epoch": 4.121156130394615, + "grad_norm": 0.001818074961192906, + "learning_rate": 0.20065547448647003, + "loss": 0.0789, + "num_input_tokens_seen": 14123888, + "step": 15615 + }, + { + "epoch": 4.122475913950113, + "grad_norm": 0.0015126349171623588, + "learning_rate": 0.20060002622364348, + "loss": 0.0581, + "num_input_tokens_seen": 14128400, + "step": 15620 + }, + { + "epoch": 4.123795697505609, + "grad_norm": 0.0005003156256861985, + "learning_rate": 0.20054457015765695, + "loss": 0.0764, + "num_input_tokens_seen": 14132688, + "step": 15625 + }, + { + "epoch": 4.125115481061106, + "grad_norm": 0.0027342652902007103, + "learning_rate": 0.20048910629706254, + "loss": 0.0549, + "num_input_tokens_seen": 14136944, + "step": 15630 + }, + { + "epoch": 4.126435264616603, + "grad_norm": 0.0019587024580687284, + "learning_rate": 0.20043363465041347, + "loss": 0.1112, + "num_input_tokens_seen": 14141424, + "step": 15635 + }, + { + "epoch": 4.1277550481721, + "grad_norm": 0.004133149981498718, + "learning_rate": 0.2003781552262641, + "loss": 0.0949, + "num_input_tokens_seen": 14146128, + "step": 15640 + }, + { + "epoch": 4.129074831727596, + "grad_norm": 0.0008223020704463124, + "learning_rate": 0.20032266803317014, + "loss": 0.0814, + "num_input_tokens_seen": 14150704, + "step": 15645 + }, + { + "epoch": 4.130394615283094, + "grad_norm": 0.0013281669234856963, + "learning_rate": 0.2002671730796884, + "loss": 0.0634, + "num_input_tokens_seen": 14155376, + "step": 15650 + }, + { + "epoch": 4.13171439883859, + "grad_norm": 0.0029946642462164164, + "learning_rate": 0.20021167037437684, + "loss": 0.0585, + "num_input_tokens_seen": 14159952, + "step": 15655 + }, + { + "epoch": 4.133034182394088, + "grad_norm": 0.003931584767997265, + "learning_rate": 0.20015615992579472, + "loss": 0.1127, + "num_input_tokens_seen": 14164624, + "step": 15660 + }, + { + "epoch": 4.134353965949584, + "grad_norm": 0.0019873117562383413, + "learning_rate": 0.20010064174250244, + "loss": 0.0487, + "num_input_tokens_seen": 14169264, + "step": 15665 + }, + { + "epoch": 4.135673749505081, + "grad_norm": 0.0034383574966341257, + "learning_rate": 0.2000451158330616, + "loss": 0.1238, + "num_input_tokens_seen": 14173872, + "step": 15670 + }, + { + "epoch": 4.136993533060578, + "grad_norm": 0.0016794995171949267, + "learning_rate": 0.199989582206035, + "loss": 0.0836, + "num_input_tokens_seen": 14178768, + "step": 15675 + }, + { + "epoch": 4.138313316616075, + "grad_norm": 0.003148111281916499, + "learning_rate": 0.1999340408699866, + "loss": 0.0958, + "num_input_tokens_seen": 14183120, + "step": 15680 + }, + { + "epoch": 4.139633100171572, + "grad_norm": 0.0018257568590342999, + "learning_rate": 0.19987849183348155, + "loss": 0.0902, + "num_input_tokens_seen": 14187312, + "step": 15685 + }, + { + "epoch": 4.140952883727069, + "grad_norm": 0.0010908214608207345, + "learning_rate": 0.19982293510508628, + "loss": 0.0974, + "num_input_tokens_seen": 14192112, + "step": 15690 + }, + { + "epoch": 4.142272667282565, + "grad_norm": 0.00170087069272995, + "learning_rate": 0.19976737069336833, + "loss": 0.1055, + "num_input_tokens_seen": 14196528, + "step": 15695 + }, + { + "epoch": 4.143592450838063, + "grad_norm": 0.0024097990244627, + "learning_rate": 0.1997117986068964, + "loss": 0.0805, + "num_input_tokens_seen": 14201168, + "step": 15700 + }, + { + "epoch": 4.144912234393559, + "grad_norm": 0.0013964036479592323, + "learning_rate": 0.19965621885424037, + "loss": 0.0622, + "num_input_tokens_seen": 14205776, + "step": 15705 + }, + { + "epoch": 4.146232017949056, + "grad_norm": 0.001619250513613224, + "learning_rate": 0.19960063144397142, + "loss": 0.0913, + "num_input_tokens_seen": 14210192, + "step": 15710 + }, + { + "epoch": 4.147551801504553, + "grad_norm": 0.0009074202389456332, + "learning_rate": 0.19954503638466176, + "loss": 0.0764, + "num_input_tokens_seen": 14214736, + "step": 15715 + }, + { + "epoch": 4.14887158506005, + "grad_norm": 0.0019515380263328552, + "learning_rate": 0.1994894336848848, + "loss": 0.052, + "num_input_tokens_seen": 14219120, + "step": 15720 + }, + { + "epoch": 4.150191368615547, + "grad_norm": 0.0015166581142693758, + "learning_rate": 0.1994338233532153, + "loss": 0.0437, + "num_input_tokens_seen": 14223824, + "step": 15725 + }, + { + "epoch": 4.151511152171044, + "grad_norm": 0.0014276168076321483, + "learning_rate": 0.19937820539822904, + "loss": 0.0708, + "num_input_tokens_seen": 14228464, + "step": 15730 + }, + { + "epoch": 4.1528309357265405, + "grad_norm": 0.0025784142781049013, + "learning_rate": 0.199322579828503, + "loss": 0.0658, + "num_input_tokens_seen": 14232976, + "step": 15735 + }, + { + "epoch": 4.154150719282038, + "grad_norm": 0.002174254273995757, + "learning_rate": 0.19926694665261527, + "loss": 0.067, + "num_input_tokens_seen": 14237616, + "step": 15740 + }, + { + "epoch": 4.1554705028375345, + "grad_norm": 0.0027578261215239763, + "learning_rate": 0.19921130587914526, + "loss": 0.125, + "num_input_tokens_seen": 14242128, + "step": 15745 + }, + { + "epoch": 4.156790286393032, + "grad_norm": 0.0022979590576142073, + "learning_rate": 0.19915565751667344, + "loss": 0.0899, + "num_input_tokens_seen": 14246480, + "step": 15750 + }, + { + "epoch": 4.1581100699485285, + "grad_norm": 0.0018580625765025616, + "learning_rate": 0.19910000157378152, + "loss": 0.103, + "num_input_tokens_seen": 14250960, + "step": 15755 + }, + { + "epoch": 4.159429853504025, + "grad_norm": 0.001996812876313925, + "learning_rate": 0.1990443380590523, + "loss": 0.1265, + "num_input_tokens_seen": 14255728, + "step": 15760 + }, + { + "epoch": 4.1607496370595225, + "grad_norm": 0.002106146886944771, + "learning_rate": 0.19898866698106984, + "loss": 0.0784, + "num_input_tokens_seen": 14260176, + "step": 15765 + }, + { + "epoch": 4.162069420615019, + "grad_norm": 0.0006216246983967721, + "learning_rate": 0.19893298834841933, + "loss": 0.0449, + "num_input_tokens_seen": 14264720, + "step": 15770 + }, + { + "epoch": 4.163389204170516, + "grad_norm": 0.0014714893186464906, + "learning_rate": 0.19887730216968705, + "loss": 0.0715, + "num_input_tokens_seen": 14269232, + "step": 15775 + }, + { + "epoch": 4.164708987726013, + "grad_norm": 0.0014148239279165864, + "learning_rate": 0.19882160845346053, + "loss": 0.0836, + "num_input_tokens_seen": 14273424, + "step": 15780 + }, + { + "epoch": 4.16602877128151, + "grad_norm": 0.0024820533581078053, + "learning_rate": 0.1987659072083285, + "loss": 0.0713, + "num_input_tokens_seen": 14277616, + "step": 15785 + }, + { + "epoch": 4.167348554837007, + "grad_norm": 0.002964932005852461, + "learning_rate": 0.1987101984428807, + "loss": 0.0933, + "num_input_tokens_seen": 14281840, + "step": 15790 + }, + { + "epoch": 4.168668338392504, + "grad_norm": 0.0016717803664505482, + "learning_rate": 0.19865448216570822, + "loss": 0.0814, + "num_input_tokens_seen": 14286384, + "step": 15795 + }, + { + "epoch": 4.169988121948, + "grad_norm": 0.001183600747026503, + "learning_rate": 0.19859875838540317, + "loss": 0.0524, + "num_input_tokens_seen": 14290896, + "step": 15800 + }, + { + "epoch": 4.169988121948, + "eval_loss": 0.09611785411834717, + "eval_runtime": 75.8885, + "eval_samples_per_second": 88.749, + "eval_steps_per_second": 22.19, + "num_input_tokens_seen": 14290896, + "step": 15800 + }, + { + "epoch": 4.171307905503498, + "grad_norm": 0.003887522267177701, + "learning_rate": 0.1985430271105588, + "loss": 0.0947, + "num_input_tokens_seen": 14295408, + "step": 15805 + }, + { + "epoch": 4.172627689058994, + "grad_norm": 0.0007056962931528687, + "learning_rate": 0.19848728834976961, + "loss": 0.0581, + "num_input_tokens_seen": 14299824, + "step": 15810 + }, + { + "epoch": 4.173947472614492, + "grad_norm": 0.0035958096850663424, + "learning_rate": 0.19843154211163128, + "loss": 0.0939, + "num_input_tokens_seen": 14304240, + "step": 15815 + }, + { + "epoch": 4.175267256169988, + "grad_norm": 0.00183185376226902, + "learning_rate": 0.1983757884047405, + "loss": 0.0527, + "num_input_tokens_seen": 14308720, + "step": 15820 + }, + { + "epoch": 4.176587039725485, + "grad_norm": 0.0020346129313111305, + "learning_rate": 0.1983200272376952, + "loss": 0.1076, + "num_input_tokens_seen": 14313328, + "step": 15825 + }, + { + "epoch": 4.177906823280982, + "grad_norm": 0.0015479744179174304, + "learning_rate": 0.1982642586190945, + "loss": 0.0485, + "num_input_tokens_seen": 14317840, + "step": 15830 + }, + { + "epoch": 4.179226606836479, + "grad_norm": 0.001616760273464024, + "learning_rate": 0.1982084825575386, + "loss": 0.0707, + "num_input_tokens_seen": 14322416, + "step": 15835 + }, + { + "epoch": 4.180546390391975, + "grad_norm": 0.0032889102585613728, + "learning_rate": 0.19815269906162883, + "loss": 0.0657, + "num_input_tokens_seen": 14327088, + "step": 15840 + }, + { + "epoch": 4.181866173947473, + "grad_norm": 0.0015465839533135295, + "learning_rate": 0.19809690813996775, + "loss": 0.0622, + "num_input_tokens_seen": 14331760, + "step": 15845 + }, + { + "epoch": 4.183185957502969, + "grad_norm": 0.0012226297985762358, + "learning_rate": 0.19804110980115905, + "loss": 0.0667, + "num_input_tokens_seen": 14335536, + "step": 15850 + }, + { + "epoch": 4.184505741058467, + "grad_norm": 0.0019158724462613463, + "learning_rate": 0.19798530405380746, + "loss": 0.0818, + "num_input_tokens_seen": 14340080, + "step": 15855 + }, + { + "epoch": 4.185825524613963, + "grad_norm": 0.0027224677614867687, + "learning_rate": 0.19792949090651893, + "loss": 0.0844, + "num_input_tokens_seen": 14344720, + "step": 15860 + }, + { + "epoch": 4.18714530816946, + "grad_norm": 0.003892782609909773, + "learning_rate": 0.19787367036790066, + "loss": 0.0788, + "num_input_tokens_seen": 14349296, + "step": 15865 + }, + { + "epoch": 4.188465091724957, + "grad_norm": 0.00278877024538815, + "learning_rate": 0.19781784244656075, + "loss": 0.0743, + "num_input_tokens_seen": 14353904, + "step": 15870 + }, + { + "epoch": 4.189784875280454, + "grad_norm": 0.0031428185757249594, + "learning_rate": 0.19776200715110864, + "loss": 0.0857, + "num_input_tokens_seen": 14358352, + "step": 15875 + }, + { + "epoch": 4.191104658835951, + "grad_norm": 0.001732244505546987, + "learning_rate": 0.1977061644901548, + "loss": 0.0844, + "num_input_tokens_seen": 14362704, + "step": 15880 + }, + { + "epoch": 4.192424442391448, + "grad_norm": 0.0012398234102874994, + "learning_rate": 0.1976503144723109, + "loss": 0.0686, + "num_input_tokens_seen": 14367280, + "step": 15885 + }, + { + "epoch": 4.193744225946944, + "grad_norm": 0.0010928320698440075, + "learning_rate": 0.19759445710618967, + "loss": 0.0767, + "num_input_tokens_seen": 14371376, + "step": 15890 + }, + { + "epoch": 4.195064009502442, + "grad_norm": 0.0019087193068116903, + "learning_rate": 0.19753859240040508, + "loss": 0.0514, + "num_input_tokens_seen": 14376080, + "step": 15895 + }, + { + "epoch": 4.196383793057938, + "grad_norm": 0.00125025468878448, + "learning_rate": 0.1974827203635721, + "loss": 0.0696, + "num_input_tokens_seen": 14380624, + "step": 15900 + }, + { + "epoch": 4.197703576613435, + "grad_norm": 0.0009079008013941348, + "learning_rate": 0.19742684100430694, + "loss": 0.0512, + "num_input_tokens_seen": 14385104, + "step": 15905 + }, + { + "epoch": 4.199023360168932, + "grad_norm": 0.0010012845741584897, + "learning_rate": 0.19737095433122692, + "loss": 0.0393, + "num_input_tokens_seen": 14389584, + "step": 15910 + }, + { + "epoch": 4.200343143724429, + "grad_norm": 0.00256548379547894, + "learning_rate": 0.19731506035295046, + "loss": 0.0654, + "num_input_tokens_seen": 14394256, + "step": 15915 + }, + { + "epoch": 4.201662927279926, + "grad_norm": 0.0035691664088517427, + "learning_rate": 0.19725915907809702, + "loss": 0.1155, + "num_input_tokens_seen": 14399216, + "step": 15920 + }, + { + "epoch": 4.202982710835423, + "grad_norm": 0.0008940082043409348, + "learning_rate": 0.1972032505152874, + "loss": 0.085, + "num_input_tokens_seen": 14403600, + "step": 15925 + }, + { + "epoch": 4.2043024943909195, + "grad_norm": 0.0018379612592980266, + "learning_rate": 0.19714733467314338, + "loss": 0.0751, + "num_input_tokens_seen": 14407888, + "step": 15930 + }, + { + "epoch": 4.205622277946417, + "grad_norm": 0.0022794168908149004, + "learning_rate": 0.19709141156028784, + "loss": 0.0522, + "num_input_tokens_seen": 14412144, + "step": 15935 + }, + { + "epoch": 4.2069420615019135, + "grad_norm": 0.0015346651198342443, + "learning_rate": 0.1970354811853448, + "loss": 0.1179, + "num_input_tokens_seen": 14416560, + "step": 15940 + }, + { + "epoch": 4.208261845057411, + "grad_norm": 0.0027796330396085978, + "learning_rate": 0.19697954355693953, + "loss": 0.1268, + "num_input_tokens_seen": 14421488, + "step": 15945 + }, + { + "epoch": 4.2095816286129075, + "grad_norm": 0.0011701708426699042, + "learning_rate": 0.19692359868369827, + "loss": 0.0719, + "num_input_tokens_seen": 14426224, + "step": 15950 + }, + { + "epoch": 4.210901412168404, + "grad_norm": 0.00199297908693552, + "learning_rate": 0.1968676465742484, + "loss": 0.071, + "num_input_tokens_seen": 14430064, + "step": 15955 + }, + { + "epoch": 4.2122211957239015, + "grad_norm": 0.0021665149834007025, + "learning_rate": 0.19681168723721845, + "loss": 0.0803, + "num_input_tokens_seen": 14434672, + "step": 15960 + }, + { + "epoch": 4.213540979279398, + "grad_norm": 0.001407565432600677, + "learning_rate": 0.19675572068123803, + "loss": 0.053, + "num_input_tokens_seen": 14439184, + "step": 15965 + }, + { + "epoch": 4.2148607628348955, + "grad_norm": 0.0025001028552651405, + "learning_rate": 0.19669974691493794, + "loss": 0.0826, + "num_input_tokens_seen": 14443824, + "step": 15970 + }, + { + "epoch": 4.216180546390392, + "grad_norm": 0.0024883721489459276, + "learning_rate": 0.19664376594695002, + "loss": 0.0687, + "num_input_tokens_seen": 14448176, + "step": 15975 + }, + { + "epoch": 4.217500329945889, + "grad_norm": 0.0014956771628931165, + "learning_rate": 0.19658777778590722, + "loss": 0.059, + "num_input_tokens_seen": 14452656, + "step": 15980 + }, + { + "epoch": 4.218820113501386, + "grad_norm": 0.0023137812968343496, + "learning_rate": 0.19653178244044364, + "loss": 0.0421, + "num_input_tokens_seen": 14457200, + "step": 15985 + }, + { + "epoch": 4.220139897056883, + "grad_norm": 0.002465044381096959, + "learning_rate": 0.19647577991919443, + "loss": 0.0545, + "num_input_tokens_seen": 14461552, + "step": 15990 + }, + { + "epoch": 4.221459680612379, + "grad_norm": 0.003460256615653634, + "learning_rate": 0.1964197702307959, + "loss": 0.0749, + "num_input_tokens_seen": 14465872, + "step": 15995 + }, + { + "epoch": 4.222779464167877, + "grad_norm": 0.0006046048365533352, + "learning_rate": 0.19636375338388545, + "loss": 0.0615, + "num_input_tokens_seen": 14470384, + "step": 16000 + }, + { + "epoch": 4.222779464167877, + "eval_loss": 0.10984824597835541, + "eval_runtime": 75.9482, + "eval_samples_per_second": 88.679, + "eval_steps_per_second": 22.173, + "num_input_tokens_seen": 14470384, + "step": 16000 + }, + { + "epoch": 4.224099247723373, + "grad_norm": 0.0021187535021454096, + "learning_rate": 0.1963077293871016, + "loss": 0.0581, + "num_input_tokens_seen": 14474960, + "step": 16005 + }, + { + "epoch": 4.225419031278871, + "grad_norm": 0.0035836156457662582, + "learning_rate": 0.19625169824908395, + "loss": 0.0756, + "num_input_tokens_seen": 14479376, + "step": 16010 + }, + { + "epoch": 4.226738814834367, + "grad_norm": 0.0029489558655768633, + "learning_rate": 0.19619565997847319, + "loss": 0.0783, + "num_input_tokens_seen": 14483952, + "step": 16015 + }, + { + "epoch": 4.228058598389864, + "grad_norm": 0.001942329341545701, + "learning_rate": 0.19613961458391113, + "loss": 0.0778, + "num_input_tokens_seen": 14488240, + "step": 16020 + }, + { + "epoch": 4.229378381945361, + "grad_norm": 0.0010883286595344543, + "learning_rate": 0.19608356207404065, + "loss": 0.0414, + "num_input_tokens_seen": 14492592, + "step": 16025 + }, + { + "epoch": 4.230698165500858, + "grad_norm": 0.0022955003660172224, + "learning_rate": 0.1960275024575058, + "loss": 0.0853, + "num_input_tokens_seen": 14497136, + "step": 16030 + }, + { + "epoch": 4.232017949056354, + "grad_norm": 0.0026303292252123356, + "learning_rate": 0.19597143574295164, + "loss": 0.1176, + "num_input_tokens_seen": 14501648, + "step": 16035 + }, + { + "epoch": 4.233337732611852, + "grad_norm": 0.003366470104083419, + "learning_rate": 0.1959153619390244, + "loss": 0.1134, + "num_input_tokens_seen": 14506160, + "step": 16040 + }, + { + "epoch": 4.234657516167348, + "grad_norm": 0.000509831530507654, + "learning_rate": 0.1958592810543713, + "loss": 0.0531, + "num_input_tokens_seen": 14510992, + "step": 16045 + }, + { + "epoch": 4.235977299722846, + "grad_norm": 0.001259775715880096, + "learning_rate": 0.19580319309764077, + "loss": 0.0748, + "num_input_tokens_seen": 14515536, + "step": 16050 + }, + { + "epoch": 4.237297083278342, + "grad_norm": 0.002836571540683508, + "learning_rate": 0.1957470980774823, + "loss": 0.0783, + "num_input_tokens_seen": 14519952, + "step": 16055 + }, + { + "epoch": 4.238616866833839, + "grad_norm": 0.002023922046646476, + "learning_rate": 0.19569099600254639, + "loss": 0.0686, + "num_input_tokens_seen": 14524720, + "step": 16060 + }, + { + "epoch": 4.239936650389336, + "grad_norm": 0.0017703116172924638, + "learning_rate": 0.1956348868814847, + "loss": 0.0884, + "num_input_tokens_seen": 14529232, + "step": 16065 + }, + { + "epoch": 4.241256433944833, + "grad_norm": 0.0030247694812715054, + "learning_rate": 0.19557877072295, + "loss": 0.0749, + "num_input_tokens_seen": 14533744, + "step": 16070 + }, + { + "epoch": 4.24257621750033, + "grad_norm": 0.0006427863845601678, + "learning_rate": 0.19552264753559603, + "loss": 0.0811, + "num_input_tokens_seen": 14538160, + "step": 16075 + }, + { + "epoch": 4.243896001055827, + "grad_norm": 0.001486246706917882, + "learning_rate": 0.19546651732807774, + "loss": 0.0805, + "num_input_tokens_seen": 14542800, + "step": 16080 + }, + { + "epoch": 4.245215784611323, + "grad_norm": 0.0004013552388641983, + "learning_rate": 0.19541038010905112, + "loss": 0.0738, + "num_input_tokens_seen": 14546960, + "step": 16085 + }, + { + "epoch": 4.246535568166821, + "grad_norm": 0.002154212910681963, + "learning_rate": 0.19535423588717324, + "loss": 0.1032, + "num_input_tokens_seen": 14551536, + "step": 16090 + }, + { + "epoch": 4.247855351722317, + "grad_norm": 0.0017016150522977114, + "learning_rate": 0.19529808467110224, + "loss": 0.0762, + "num_input_tokens_seen": 14556112, + "step": 16095 + }, + { + "epoch": 4.249175135277815, + "grad_norm": 0.0017536170780658722, + "learning_rate": 0.19524192646949734, + "loss": 0.0623, + "num_input_tokens_seen": 14560336, + "step": 16100 + }, + { + "epoch": 4.250494918833311, + "grad_norm": 0.0020875167101621628, + "learning_rate": 0.19518576129101878, + "loss": 0.0792, + "num_input_tokens_seen": 14564912, + "step": 16105 + }, + { + "epoch": 4.251814702388808, + "grad_norm": 0.002075232332572341, + "learning_rate": 0.19512958914432804, + "loss": 0.0697, + "num_input_tokens_seen": 14569584, + "step": 16110 + }, + { + "epoch": 4.253134485944305, + "grad_norm": 0.0011688168160617352, + "learning_rate": 0.1950734100380875, + "loss": 0.0697, + "num_input_tokens_seen": 14573840, + "step": 16115 + }, + { + "epoch": 4.254454269499802, + "grad_norm": 0.0016651249025017023, + "learning_rate": 0.19501722398096066, + "loss": 0.074, + "num_input_tokens_seen": 14578160, + "step": 16120 + }, + { + "epoch": 4.2557740530552985, + "grad_norm": 0.003333914326503873, + "learning_rate": 0.1949610309816122, + "loss": 0.0803, + "num_input_tokens_seen": 14582320, + "step": 16125 + }, + { + "epoch": 4.257093836610796, + "grad_norm": 0.00019802914175670594, + "learning_rate": 0.1949048310487078, + "loss": 0.0794, + "num_input_tokens_seen": 14586800, + "step": 16130 + }, + { + "epoch": 4.2584136201662925, + "grad_norm": 0.002234244719147682, + "learning_rate": 0.19484862419091406, + "loss": 0.0833, + "num_input_tokens_seen": 14591376, + "step": 16135 + }, + { + "epoch": 4.25973340372179, + "grad_norm": 0.0026668887585401535, + "learning_rate": 0.19479241041689893, + "loss": 0.0856, + "num_input_tokens_seen": 14596112, + "step": 16140 + }, + { + "epoch": 4.2610531872772865, + "grad_norm": 0.0027892221696674824, + "learning_rate": 0.19473618973533116, + "loss": 0.0712, + "num_input_tokens_seen": 14600400, + "step": 16145 + }, + { + "epoch": 4.262372970832783, + "grad_norm": 0.003556754905730486, + "learning_rate": 0.19467996215488076, + "loss": 0.1524, + "num_input_tokens_seen": 14604720, + "step": 16150 + }, + { + "epoch": 4.2636927543882805, + "grad_norm": 0.0016665932489559054, + "learning_rate": 0.1946237276842187, + "loss": 0.0818, + "num_input_tokens_seen": 14609520, + "step": 16155 + }, + { + "epoch": 4.265012537943777, + "grad_norm": 0.0033151376992464066, + "learning_rate": 0.19456748633201712, + "loss": 0.0778, + "num_input_tokens_seen": 14614000, + "step": 16160 + }, + { + "epoch": 4.266332321499274, + "grad_norm": 0.001750276773236692, + "learning_rate": 0.194511238106949, + "loss": 0.0792, + "num_input_tokens_seen": 14618864, + "step": 16165 + }, + { + "epoch": 4.267652105054771, + "grad_norm": 0.0015027282061055303, + "learning_rate": 0.19445498301768863, + "loss": 0.0769, + "num_input_tokens_seen": 14623248, + "step": 16170 + }, + { + "epoch": 4.268971888610268, + "grad_norm": 0.0022651394829154015, + "learning_rate": 0.19439872107291126, + "loss": 0.0912, + "num_input_tokens_seen": 14627824, + "step": 16175 + }, + { + "epoch": 4.270291672165765, + "grad_norm": 0.0006651230505667627, + "learning_rate": 0.1943424522812931, + "loss": 0.0698, + "num_input_tokens_seen": 14632304, + "step": 16180 + }, + { + "epoch": 4.271611455721262, + "grad_norm": 0.0029749770183116198, + "learning_rate": 0.19428617665151157, + "loss": 0.0719, + "num_input_tokens_seen": 14637072, + "step": 16185 + }, + { + "epoch": 4.272931239276758, + "grad_norm": 0.0015086885541677475, + "learning_rate": 0.19422989419224507, + "loss": 0.0856, + "num_input_tokens_seen": 14641552, + "step": 16190 + }, + { + "epoch": 4.274251022832256, + "grad_norm": 0.0014711646363139153, + "learning_rate": 0.19417360491217303, + "loss": 0.0608, + "num_input_tokens_seen": 14646224, + "step": 16195 + }, + { + "epoch": 4.275570806387752, + "grad_norm": 0.003032029839232564, + "learning_rate": 0.19411730881997605, + "loss": 0.0992, + "num_input_tokens_seen": 14650736, + "step": 16200 + }, + { + "epoch": 4.275570806387752, + "eval_loss": 0.09948311001062393, + "eval_runtime": 75.7454, + "eval_samples_per_second": 88.916, + "eval_steps_per_second": 22.232, + "num_input_tokens_seen": 14650736, + "step": 16200 + }, + { + "epoch": 4.27689058994325, + "grad_norm": 0.0005395386833697557, + "learning_rate": 0.1940610059243356, + "loss": 0.0698, + "num_input_tokens_seen": 14655088, + "step": 16205 + }, + { + "epoch": 4.278210373498746, + "grad_norm": 0.004521840717643499, + "learning_rate": 0.19400469623393435, + "loss": 0.0722, + "num_input_tokens_seen": 14659664, + "step": 16210 + }, + { + "epoch": 4.279530157054243, + "grad_norm": 0.0011700227623805404, + "learning_rate": 0.1939483797574559, + "loss": 0.0737, + "num_input_tokens_seen": 14664656, + "step": 16215 + }, + { + "epoch": 4.28084994060974, + "grad_norm": 0.003115683328360319, + "learning_rate": 0.19389205650358504, + "loss": 0.081, + "num_input_tokens_seen": 14669328, + "step": 16220 + }, + { + "epoch": 4.282169724165237, + "grad_norm": 0.001657776883803308, + "learning_rate": 0.19383572648100747, + "loss": 0.101, + "num_input_tokens_seen": 14673936, + "step": 16225 + }, + { + "epoch": 4.283489507720734, + "grad_norm": 0.0006857718108221889, + "learning_rate": 0.19377938969841, + "loss": 0.0782, + "num_input_tokens_seen": 14678576, + "step": 16230 + }, + { + "epoch": 4.284809291276231, + "grad_norm": 0.002124596619978547, + "learning_rate": 0.1937230461644805, + "loss": 0.069, + "num_input_tokens_seen": 14683440, + "step": 16235 + }, + { + "epoch": 4.286129074831727, + "grad_norm": 0.0011491281911730766, + "learning_rate": 0.19366669588790777, + "loss": 0.0899, + "num_input_tokens_seen": 14687760, + "step": 16240 + }, + { + "epoch": 4.287448858387225, + "grad_norm": 0.0019783724565058947, + "learning_rate": 0.19361033887738185, + "loss": 0.0452, + "num_input_tokens_seen": 14692496, + "step": 16245 + }, + { + "epoch": 4.288768641942721, + "grad_norm": 0.00217032409273088, + "learning_rate": 0.19355397514159361, + "loss": 0.0592, + "num_input_tokens_seen": 14696848, + "step": 16250 + }, + { + "epoch": 4.290088425498218, + "grad_norm": 0.001509128836914897, + "learning_rate": 0.19349760468923508, + "loss": 0.0833, + "num_input_tokens_seen": 14701520, + "step": 16255 + }, + { + "epoch": 4.291408209053715, + "grad_norm": 0.0011326824314892292, + "learning_rate": 0.19344122752899925, + "loss": 0.0587, + "num_input_tokens_seen": 14706384, + "step": 16260 + }, + { + "epoch": 4.292727992609212, + "grad_norm": 0.0008858708315528929, + "learning_rate": 0.1933848436695802, + "loss": 0.0602, + "num_input_tokens_seen": 14710896, + "step": 16265 + }, + { + "epoch": 4.294047776164709, + "grad_norm": 0.0012151622213423252, + "learning_rate": 0.1933284531196731, + "loss": 0.0666, + "num_input_tokens_seen": 14715600, + "step": 16270 + }, + { + "epoch": 4.295367559720206, + "grad_norm": 0.002614024793729186, + "learning_rate": 0.19327205588797403, + "loss": 0.0662, + "num_input_tokens_seen": 14720176, + "step": 16275 + }, + { + "epoch": 4.2966873432757025, + "grad_norm": 0.0010359641164541245, + "learning_rate": 0.19321565198318014, + "loss": 0.1241, + "num_input_tokens_seen": 14724656, + "step": 16280 + }, + { + "epoch": 4.2980071268312, + "grad_norm": 0.001162207219749689, + "learning_rate": 0.1931592414139896, + "loss": 0.0819, + "num_input_tokens_seen": 14729488, + "step": 16285 + }, + { + "epoch": 4.2993269103866965, + "grad_norm": 0.001754379365593195, + "learning_rate": 0.19310282418910169, + "loss": 0.078, + "num_input_tokens_seen": 14733744, + "step": 16290 + }, + { + "epoch": 4.300646693942193, + "grad_norm": 0.001005414524115622, + "learning_rate": 0.1930464003172166, + "loss": 0.0485, + "num_input_tokens_seen": 14738256, + "step": 16295 + }, + { + "epoch": 4.3019664774976905, + "grad_norm": 0.001672291778959334, + "learning_rate": 0.19298996980703567, + "loss": 0.0905, + "num_input_tokens_seen": 14742800, + "step": 16300 + }, + { + "epoch": 4.303286261053187, + "grad_norm": 0.0035671174991875887, + "learning_rate": 0.19293353266726113, + "loss": 0.092, + "num_input_tokens_seen": 14747344, + "step": 16305 + }, + { + "epoch": 4.3046060446086845, + "grad_norm": 0.001620063791051507, + "learning_rate": 0.19287708890659633, + "loss": 0.0667, + "num_input_tokens_seen": 14751984, + "step": 16310 + }, + { + "epoch": 4.305925828164181, + "grad_norm": 0.002258321037515998, + "learning_rate": 0.19282063853374556, + "loss": 0.1046, + "num_input_tokens_seen": 14756624, + "step": 16315 + }, + { + "epoch": 4.307245611719678, + "grad_norm": 0.0029655720572918653, + "learning_rate": 0.19276418155741423, + "loss": 0.0839, + "num_input_tokens_seen": 14761264, + "step": 16320 + }, + { + "epoch": 4.308565395275175, + "grad_norm": 0.0010150705929845572, + "learning_rate": 0.19270771798630867, + "loss": 0.1052, + "num_input_tokens_seen": 14765904, + "step": 16325 + }, + { + "epoch": 4.309885178830672, + "grad_norm": 0.0014310716651380062, + "learning_rate": 0.1926512478291363, + "loss": 0.0575, + "num_input_tokens_seen": 14770448, + "step": 16330 + }, + { + "epoch": 4.311204962386169, + "grad_norm": 0.002258522668853402, + "learning_rate": 0.19259477109460557, + "loss": 0.07, + "num_input_tokens_seen": 14774832, + "step": 16335 + }, + { + "epoch": 4.312524745941666, + "grad_norm": 0.0014401318039745092, + "learning_rate": 0.19253828779142584, + "loss": 0.0667, + "num_input_tokens_seen": 14779504, + "step": 16340 + }, + { + "epoch": 4.313844529497162, + "grad_norm": 0.0010846547083929181, + "learning_rate": 0.19248179792830755, + "loss": 0.0992, + "num_input_tokens_seen": 14784112, + "step": 16345 + }, + { + "epoch": 4.31516431305266, + "grad_norm": 0.0023932005278766155, + "learning_rate": 0.19242530151396217, + "loss": 0.0906, + "num_input_tokens_seen": 14788624, + "step": 16350 + }, + { + "epoch": 4.316484096608156, + "grad_norm": 0.002633939730003476, + "learning_rate": 0.19236879855710215, + "loss": 0.1071, + "num_input_tokens_seen": 14793744, + "step": 16355 + }, + { + "epoch": 4.317803880163654, + "grad_norm": 0.0026702038012444973, + "learning_rate": 0.19231228906644096, + "loss": 0.1089, + "num_input_tokens_seen": 14798448, + "step": 16360 + }, + { + "epoch": 4.31912366371915, + "grad_norm": 0.0021062707528471947, + "learning_rate": 0.19225577305069302, + "loss": 0.0525, + "num_input_tokens_seen": 14802928, + "step": 16365 + }, + { + "epoch": 4.320443447274647, + "grad_norm": 0.002987529616802931, + "learning_rate": 0.1921992505185739, + "loss": 0.115, + "num_input_tokens_seen": 14807312, + "step": 16370 + }, + { + "epoch": 4.321763230830144, + "grad_norm": 0.001192722818814218, + "learning_rate": 0.19214272147880004, + "loss": 0.0883, + "num_input_tokens_seen": 14811792, + "step": 16375 + }, + { + "epoch": 4.323083014385641, + "grad_norm": 0.0014787226682528853, + "learning_rate": 0.19208618594008892, + "loss": 0.0907, + "num_input_tokens_seen": 14816112, + "step": 16380 + }, + { + "epoch": 4.324402797941137, + "grad_norm": 0.0009363790741190314, + "learning_rate": 0.19202964391115904, + "loss": 0.0486, + "num_input_tokens_seen": 14820432, + "step": 16385 + }, + { + "epoch": 4.325722581496635, + "grad_norm": 0.002837628358975053, + "learning_rate": 0.1919730954007299, + "loss": 0.0864, + "num_input_tokens_seen": 14825200, + "step": 16390 + }, + { + "epoch": 4.327042365052131, + "grad_norm": 0.0017289584502577782, + "learning_rate": 0.19191654041752199, + "loss": 0.0748, + "num_input_tokens_seen": 14829872, + "step": 16395 + }, + { + "epoch": 4.328362148607629, + "grad_norm": 0.002925847889855504, + "learning_rate": 0.19185997897025678, + "loss": 0.1054, + "num_input_tokens_seen": 14834416, + "step": 16400 + }, + { + "epoch": 4.328362148607629, + "eval_loss": 0.09261716902256012, + "eval_runtime": 75.9218, + "eval_samples_per_second": 88.71, + "eval_steps_per_second": 22.181, + "num_input_tokens_seen": 14834416, + "step": 16400 + }, + { + "epoch": 4.329681932163125, + "grad_norm": 0.0024614930152893066, + "learning_rate": 0.19180341106765672, + "loss": 0.0995, + "num_input_tokens_seen": 14838832, + "step": 16405 + }, + { + "epoch": 4.331001715718622, + "grad_norm": 0.0010971666779369116, + "learning_rate": 0.19174683671844536, + "loss": 0.0817, + "num_input_tokens_seen": 14843184, + "step": 16410 + }, + { + "epoch": 4.332321499274119, + "grad_norm": 0.002265132497996092, + "learning_rate": 0.19169025593134717, + "loss": 0.0912, + "num_input_tokens_seen": 14847344, + "step": 16415 + }, + { + "epoch": 4.333641282829616, + "grad_norm": 0.0024155157152563334, + "learning_rate": 0.19163366871508764, + "loss": 0.1174, + "num_input_tokens_seen": 14851824, + "step": 16420 + }, + { + "epoch": 4.334961066385113, + "grad_norm": 0.0018298923969268799, + "learning_rate": 0.19157707507839317, + "loss": 0.1056, + "num_input_tokens_seen": 14856272, + "step": 16425 + }, + { + "epoch": 4.33628084994061, + "grad_norm": 0.0012301866663619876, + "learning_rate": 0.19152047502999123, + "loss": 0.0714, + "num_input_tokens_seen": 14861104, + "step": 16430 + }, + { + "epoch": 4.337600633496106, + "grad_norm": 0.0013586340937763453, + "learning_rate": 0.19146386857861025, + "loss": 0.0714, + "num_input_tokens_seen": 14865904, + "step": 16435 + }, + { + "epoch": 4.338920417051604, + "grad_norm": 0.0018110976088792086, + "learning_rate": 0.19140725573297968, + "loss": 0.0911, + "num_input_tokens_seen": 14870672, + "step": 16440 + }, + { + "epoch": 4.3402402006071, + "grad_norm": 0.0033024849835783243, + "learning_rate": 0.19135063650182987, + "loss": 0.0809, + "num_input_tokens_seen": 14875024, + "step": 16445 + }, + { + "epoch": 4.341559984162597, + "grad_norm": 0.0010583876864984632, + "learning_rate": 0.19129401089389234, + "loss": 0.0693, + "num_input_tokens_seen": 14879760, + "step": 16450 + }, + { + "epoch": 4.342879767718094, + "grad_norm": 0.0015882377047091722, + "learning_rate": 0.19123737891789938, + "loss": 0.0909, + "num_input_tokens_seen": 14884432, + "step": 16455 + }, + { + "epoch": 4.344199551273591, + "grad_norm": 0.0014093086356297135, + "learning_rate": 0.19118074058258439, + "loss": 0.0717, + "num_input_tokens_seen": 14888784, + "step": 16460 + }, + { + "epoch": 4.345519334829088, + "grad_norm": 0.0013023287756368518, + "learning_rate": 0.1911240958966816, + "loss": 0.0532, + "num_input_tokens_seen": 14893360, + "step": 16465 + }, + { + "epoch": 4.346839118384585, + "grad_norm": 0.0011392422020435333, + "learning_rate": 0.19106744486892652, + "loss": 0.0982, + "num_input_tokens_seen": 14897904, + "step": 16470 + }, + { + "epoch": 4.3481589019400815, + "grad_norm": 0.002121577039361, + "learning_rate": 0.1910107875080553, + "loss": 0.0717, + "num_input_tokens_seen": 14902192, + "step": 16475 + }, + { + "epoch": 4.349478685495579, + "grad_norm": 0.0014983314322307706, + "learning_rate": 0.19095412382280533, + "loss": 0.0512, + "num_input_tokens_seen": 14906352, + "step": 16480 + }, + { + "epoch": 4.3507984690510755, + "grad_norm": 0.003454516176134348, + "learning_rate": 0.19089745382191473, + "loss": 0.0767, + "num_input_tokens_seen": 14911120, + "step": 16485 + }, + { + "epoch": 4.352118252606573, + "grad_norm": 0.0023951553739607334, + "learning_rate": 0.19084077751412284, + "loss": 0.0725, + "num_input_tokens_seen": 14915440, + "step": 16490 + }, + { + "epoch": 4.3534380361620695, + "grad_norm": 0.002555791288614273, + "learning_rate": 0.19078409490816986, + "loss": 0.1051, + "num_input_tokens_seen": 14919920, + "step": 16495 + }, + { + "epoch": 4.354757819717566, + "grad_norm": 0.003245303872972727, + "learning_rate": 0.19072740601279686, + "loss": 0.0945, + "num_input_tokens_seen": 14924496, + "step": 16500 + }, + { + "epoch": 4.3560776032730635, + "grad_norm": 0.003843435551971197, + "learning_rate": 0.19067071083674605, + "loss": 0.0664, + "num_input_tokens_seen": 14928912, + "step": 16505 + }, + { + "epoch": 4.35739738682856, + "grad_norm": 0.0034361514262855053, + "learning_rate": 0.19061400938876052, + "loss": 0.096, + "num_input_tokens_seen": 14933488, + "step": 16510 + }, + { + "epoch": 4.3587171703840575, + "grad_norm": 0.0016447732923552394, + "learning_rate": 0.1905573016775844, + "loss": 0.0595, + "num_input_tokens_seen": 14937808, + "step": 16515 + }, + { + "epoch": 4.360036953939554, + "grad_norm": 0.0017252584220841527, + "learning_rate": 0.19050058771196263, + "loss": 0.065, + "num_input_tokens_seen": 14942384, + "step": 16520 + }, + { + "epoch": 4.361356737495051, + "grad_norm": 0.0011154164094477892, + "learning_rate": 0.19044386750064132, + "loss": 0.0949, + "num_input_tokens_seen": 14947088, + "step": 16525 + }, + { + "epoch": 4.362676521050548, + "grad_norm": 0.0017616734839975834, + "learning_rate": 0.19038714105236737, + "loss": 0.0917, + "num_input_tokens_seen": 14951664, + "step": 16530 + }, + { + "epoch": 4.363996304606045, + "grad_norm": 0.0020959428511559963, + "learning_rate": 0.19033040837588874, + "loss": 0.0432, + "num_input_tokens_seen": 14956304, + "step": 16535 + }, + { + "epoch": 4.365316088161541, + "grad_norm": 0.002167697297409177, + "learning_rate": 0.1902736694799543, + "loss": 0.082, + "num_input_tokens_seen": 14960784, + "step": 16540 + }, + { + "epoch": 4.366635871717039, + "grad_norm": 0.0013329698704183102, + "learning_rate": 0.19021692437331392, + "loss": 0.0666, + "num_input_tokens_seen": 14965456, + "step": 16545 + }, + { + "epoch": 4.367955655272535, + "grad_norm": 0.002060298342257738, + "learning_rate": 0.1901601730647184, + "loss": 0.0651, + "num_input_tokens_seen": 14969936, + "step": 16550 + }, + { + "epoch": 4.369275438828033, + "grad_norm": 0.0007308796048164368, + "learning_rate": 0.19010341556291954, + "loss": 0.0745, + "num_input_tokens_seen": 14974640, + "step": 16555 + }, + { + "epoch": 4.370595222383529, + "grad_norm": 0.0018809428438544273, + "learning_rate": 0.19004665187667, + "loss": 0.1071, + "num_input_tokens_seen": 14979056, + "step": 16560 + }, + { + "epoch": 4.371915005939026, + "grad_norm": 0.0009160954505205154, + "learning_rate": 0.1899898820147235, + "loss": 0.079, + "num_input_tokens_seen": 14983472, + "step": 16565 + }, + { + "epoch": 4.373234789494523, + "grad_norm": 0.0026998736429959536, + "learning_rate": 0.18993310598583465, + "loss": 0.0804, + "num_input_tokens_seen": 14987984, + "step": 16570 + }, + { + "epoch": 4.37455457305002, + "grad_norm": 0.0015691305743530393, + "learning_rate": 0.18987632379875904, + "loss": 0.0615, + "num_input_tokens_seen": 14992304, + "step": 16575 + }, + { + "epoch": 4.375874356605516, + "grad_norm": 0.0015522887697443366, + "learning_rate": 0.18981953546225314, + "loss": 0.0961, + "num_input_tokens_seen": 14996880, + "step": 16580 + }, + { + "epoch": 4.377194140161014, + "grad_norm": 0.0009146786178462207, + "learning_rate": 0.18976274098507445, + "loss": 0.0818, + "num_input_tokens_seen": 15001552, + "step": 16585 + }, + { + "epoch": 4.37851392371651, + "grad_norm": 0.0012138418387621641, + "learning_rate": 0.18970594037598146, + "loss": 0.0813, + "num_input_tokens_seen": 15005936, + "step": 16590 + }, + { + "epoch": 4.379833707272008, + "grad_norm": 0.0015267785638570786, + "learning_rate": 0.1896491336437335, + "loss": 0.0666, + "num_input_tokens_seen": 15010416, + "step": 16595 + }, + { + "epoch": 4.381153490827504, + "grad_norm": 0.002133936621248722, + "learning_rate": 0.18959232079709085, + "loss": 0.0435, + "num_input_tokens_seen": 15014800, + "step": 16600 + }, + { + "epoch": 4.381153490827504, + "eval_loss": 0.09952955693006516, + "eval_runtime": 75.8999, + "eval_samples_per_second": 88.735, + "eval_steps_per_second": 22.187, + "num_input_tokens_seen": 15014800, + "step": 16600 + }, + { + "epoch": 4.382473274383001, + "grad_norm": 0.0033856474328786135, + "learning_rate": 0.18953550184481477, + "loss": 0.1029, + "num_input_tokens_seen": 15019248, + "step": 16605 + }, + { + "epoch": 4.383793057938498, + "grad_norm": 0.00317897810600698, + "learning_rate": 0.18947867679566752, + "loss": 0.0862, + "num_input_tokens_seen": 15023536, + "step": 16610 + }, + { + "epoch": 4.385112841493995, + "grad_norm": 0.0009571745758876204, + "learning_rate": 0.18942184565841216, + "loss": 0.07, + "num_input_tokens_seen": 15028112, + "step": 16615 + }, + { + "epoch": 4.386432625049492, + "grad_norm": 0.0025460084434598684, + "learning_rate": 0.18936500844181278, + "loss": 0.0781, + "num_input_tokens_seen": 15032464, + "step": 16620 + }, + { + "epoch": 4.387752408604989, + "grad_norm": 0.0018651766004040837, + "learning_rate": 0.18930816515463436, + "loss": 0.0721, + "num_input_tokens_seen": 15037200, + "step": 16625 + }, + { + "epoch": 4.389072192160485, + "grad_norm": 0.0012866948964074254, + "learning_rate": 0.18925131580564297, + "loss": 0.0626, + "num_input_tokens_seen": 15041872, + "step": 16630 + }, + { + "epoch": 4.390391975715983, + "grad_norm": 0.001452346215955913, + "learning_rate": 0.1891944604036054, + "loss": 0.0527, + "num_input_tokens_seen": 15046512, + "step": 16635 + }, + { + "epoch": 4.391711759271479, + "grad_norm": 0.0016137767815962434, + "learning_rate": 0.1891375989572895, + "loss": 0.0589, + "num_input_tokens_seen": 15050800, + "step": 16640 + }, + { + "epoch": 4.393031542826977, + "grad_norm": 0.0017902019899338484, + "learning_rate": 0.18908073147546398, + "loss": 0.072, + "num_input_tokens_seen": 15055472, + "step": 16645 + }, + { + "epoch": 4.394351326382473, + "grad_norm": 0.0008965583983808756, + "learning_rate": 0.18902385796689858, + "loss": 0.0823, + "num_input_tokens_seen": 15060080, + "step": 16650 + }, + { + "epoch": 4.39567110993797, + "grad_norm": 0.0028356527909636497, + "learning_rate": 0.18896697844036384, + "loss": 0.1474, + "num_input_tokens_seen": 15064400, + "step": 16655 + }, + { + "epoch": 4.396990893493467, + "grad_norm": 0.002271860372275114, + "learning_rate": 0.18891009290463137, + "loss": 0.0741, + "num_input_tokens_seen": 15068784, + "step": 16660 + }, + { + "epoch": 4.398310677048964, + "grad_norm": 0.0030256123282015324, + "learning_rate": 0.18885320136847353, + "loss": 0.1105, + "num_input_tokens_seen": 15073424, + "step": 16665 + }, + { + "epoch": 4.3996304606044605, + "grad_norm": 0.003968017641454935, + "learning_rate": 0.1887963038406639, + "loss": 0.0956, + "num_input_tokens_seen": 15077968, + "step": 16670 + }, + { + "epoch": 4.400950244159958, + "grad_norm": 0.0024667507968842983, + "learning_rate": 0.18873940032997658, + "loss": 0.1269, + "num_input_tokens_seen": 15082416, + "step": 16675 + }, + { + "epoch": 4.4022700277154545, + "grad_norm": 0.0013899084879085422, + "learning_rate": 0.18868249084518693, + "loss": 0.0977, + "num_input_tokens_seen": 15086544, + "step": 16680 + }, + { + "epoch": 4.403589811270952, + "grad_norm": 0.0016157394275069237, + "learning_rate": 0.18862557539507102, + "loss": 0.1105, + "num_input_tokens_seen": 15090928, + "step": 16685 + }, + { + "epoch": 4.4049095948264485, + "grad_norm": 0.001172735937871039, + "learning_rate": 0.18856865398840605, + "loss": 0.0825, + "num_input_tokens_seen": 15095248, + "step": 16690 + }, + { + "epoch": 4.406229378381945, + "grad_norm": 0.0017088048625737429, + "learning_rate": 0.18851172663396995, + "loss": 0.0724, + "num_input_tokens_seen": 15099600, + "step": 16695 + }, + { + "epoch": 4.4075491619374425, + "grad_norm": 0.002629259368404746, + "learning_rate": 0.1884547933405416, + "loss": 0.0748, + "num_input_tokens_seen": 15104336, + "step": 16700 + }, + { + "epoch": 4.408868945492939, + "grad_norm": 0.0014877525391057134, + "learning_rate": 0.1883978541169009, + "loss": 0.0689, + "num_input_tokens_seen": 15108816, + "step": 16705 + }, + { + "epoch": 4.410188729048436, + "grad_norm": 0.0011282548075541854, + "learning_rate": 0.18834090897182854, + "loss": 0.0852, + "num_input_tokens_seen": 15113072, + "step": 16710 + }, + { + "epoch": 4.411508512603933, + "grad_norm": 0.0022164001129567623, + "learning_rate": 0.1882839579141062, + "loss": 0.0545, + "num_input_tokens_seen": 15117552, + "step": 16715 + }, + { + "epoch": 4.41282829615943, + "grad_norm": 0.0018933705287054181, + "learning_rate": 0.18822700095251646, + "loss": 0.073, + "num_input_tokens_seen": 15122064, + "step": 16720 + }, + { + "epoch": 4.414148079714927, + "grad_norm": 0.0018605145160108805, + "learning_rate": 0.18817003809584273, + "loss": 0.1014, + "num_input_tokens_seen": 15126512, + "step": 16725 + }, + { + "epoch": 4.415467863270424, + "grad_norm": 0.0016813104739412665, + "learning_rate": 0.1881130693528695, + "loss": 0.081, + "num_input_tokens_seen": 15130960, + "step": 16730 + }, + { + "epoch": 4.41678764682592, + "grad_norm": 0.0008135170792229474, + "learning_rate": 0.18805609473238197, + "loss": 0.0698, + "num_input_tokens_seen": 15135632, + "step": 16735 + }, + { + "epoch": 4.418107430381418, + "grad_norm": 0.002234196290373802, + "learning_rate": 0.18799911424316643, + "loss": 0.0772, + "num_input_tokens_seen": 15140208, + "step": 16740 + }, + { + "epoch": 4.419427213936914, + "grad_norm": 0.0017149040941148996, + "learning_rate": 0.18794212789400994, + "loss": 0.0605, + "num_input_tokens_seen": 15144688, + "step": 16745 + }, + { + "epoch": 4.420746997492412, + "grad_norm": 0.0014463280094787478, + "learning_rate": 0.18788513569370052, + "loss": 0.0931, + "num_input_tokens_seen": 15149360, + "step": 16750 + }, + { + "epoch": 4.422066781047908, + "grad_norm": 0.0015662484802305698, + "learning_rate": 0.1878281376510271, + "loss": 0.0759, + "num_input_tokens_seen": 15153936, + "step": 16755 + }, + { + "epoch": 4.423386564603405, + "grad_norm": 0.0022486140951514244, + "learning_rate": 0.18777113377477941, + "loss": 0.0668, + "num_input_tokens_seen": 15158384, + "step": 16760 + }, + { + "epoch": 4.424706348158902, + "grad_norm": 0.0010957913473248482, + "learning_rate": 0.1877141240737483, + "loss": 0.0458, + "num_input_tokens_seen": 15163248, + "step": 16765 + }, + { + "epoch": 4.426026131714399, + "grad_norm": 0.0018056387780234218, + "learning_rate": 0.18765710855672527, + "loss": 0.0861, + "num_input_tokens_seen": 15167408, + "step": 16770 + }, + { + "epoch": 4.427345915269896, + "grad_norm": 0.0009274277254007757, + "learning_rate": 0.18760008723250288, + "loss": 0.094, + "num_input_tokens_seen": 15171760, + "step": 16775 + }, + { + "epoch": 4.428665698825393, + "grad_norm": 0.0035543562844395638, + "learning_rate": 0.18754306010987457, + "loss": 0.0805, + "num_input_tokens_seen": 15176656, + "step": 16780 + }, + { + "epoch": 4.429985482380889, + "grad_norm": 0.002372755203396082, + "learning_rate": 0.18748602719763457, + "loss": 0.0609, + "num_input_tokens_seen": 15181040, + "step": 16785 + }, + { + "epoch": 4.431305265936387, + "grad_norm": 0.0022032924462109804, + "learning_rate": 0.18742898850457804, + "loss": 0.0595, + "num_input_tokens_seen": 15185264, + "step": 16790 + }, + { + "epoch": 4.432625049491883, + "grad_norm": 0.0011767216492444277, + "learning_rate": 0.1873719440395012, + "loss": 0.1039, + "num_input_tokens_seen": 15189456, + "step": 16795 + }, + { + "epoch": 4.43394483304738, + "grad_norm": 0.0028589048888534307, + "learning_rate": 0.1873148938112009, + "loss": 0.0581, + "num_input_tokens_seen": 15194064, + "step": 16800 + }, + { + "epoch": 4.43394483304738, + "eval_loss": 0.09733599424362183, + "eval_runtime": 75.9013, + "eval_samples_per_second": 88.734, + "eval_steps_per_second": 22.187, + "num_input_tokens_seen": 15194064, + "step": 16800 + }, + { + "epoch": 4.435264616602877, + "grad_norm": 0.001685268827714026, + "learning_rate": 0.18725783782847508, + "loss": 0.059, + "num_input_tokens_seen": 15198768, + "step": 16805 + }, + { + "epoch": 4.436584400158374, + "grad_norm": 0.0003931908286176622, + "learning_rate": 0.1872007761001224, + "loss": 0.0563, + "num_input_tokens_seen": 15203216, + "step": 16810 + }, + { + "epoch": 4.437904183713871, + "grad_norm": 0.0022517689503729343, + "learning_rate": 0.1871437086349426, + "loss": 0.0832, + "num_input_tokens_seen": 15207472, + "step": 16815 + }, + { + "epoch": 4.439223967269368, + "grad_norm": 0.0035817930474877357, + "learning_rate": 0.18708663544173615, + "loss": 0.1197, + "num_input_tokens_seen": 15212176, + "step": 16820 + }, + { + "epoch": 4.440543750824864, + "grad_norm": 0.0011754566803574562, + "learning_rate": 0.18702955652930442, + "loss": 0.0797, + "num_input_tokens_seen": 15216848, + "step": 16825 + }, + { + "epoch": 4.441863534380362, + "grad_norm": 0.0025875302962958813, + "learning_rate": 0.18697247190644972, + "loss": 0.0607, + "num_input_tokens_seen": 15221616, + "step": 16830 + }, + { + "epoch": 4.443183317935858, + "grad_norm": 0.0005159539869055152, + "learning_rate": 0.18691538158197527, + "loss": 0.0797, + "num_input_tokens_seen": 15226320, + "step": 16835 + }, + { + "epoch": 4.444503101491355, + "grad_norm": 0.0017835496691986918, + "learning_rate": 0.1868582855646851, + "loss": 0.0618, + "num_input_tokens_seen": 15231120, + "step": 16840 + }, + { + "epoch": 4.445822885046852, + "grad_norm": 0.0033846755977720022, + "learning_rate": 0.18680118386338404, + "loss": 0.1014, + "num_input_tokens_seen": 15235952, + "step": 16845 + }, + { + "epoch": 4.447142668602349, + "grad_norm": 0.0029132238123565912, + "learning_rate": 0.18674407648687794, + "loss": 0.1008, + "num_input_tokens_seen": 15240496, + "step": 16850 + }, + { + "epoch": 4.448462452157846, + "grad_norm": 0.002390059642493725, + "learning_rate": 0.1866869634439736, + "loss": 0.0584, + "num_input_tokens_seen": 15245104, + "step": 16855 + }, + { + "epoch": 4.449782235713343, + "grad_norm": 0.0010457377647981048, + "learning_rate": 0.18662984474347838, + "loss": 0.0662, + "num_input_tokens_seen": 15249744, + "step": 16860 + }, + { + "epoch": 4.4511020192688395, + "grad_norm": 0.0013416989240795374, + "learning_rate": 0.1865727203942008, + "loss": 0.0748, + "num_input_tokens_seen": 15254576, + "step": 16865 + }, + { + "epoch": 4.452421802824337, + "grad_norm": 0.002256252570077777, + "learning_rate": 0.1865155904049501, + "loss": 0.0838, + "num_input_tokens_seen": 15259248, + "step": 16870 + }, + { + "epoch": 4.4537415863798335, + "grad_norm": 0.0021938346326351166, + "learning_rate": 0.1864584547845365, + "loss": 0.0535, + "num_input_tokens_seen": 15263792, + "step": 16875 + }, + { + "epoch": 4.455061369935331, + "grad_norm": 0.0013779315631836653, + "learning_rate": 0.186401313541771, + "loss": 0.0523, + "num_input_tokens_seen": 15268432, + "step": 16880 + }, + { + "epoch": 4.4563811534908275, + "grad_norm": 0.0034471945837140083, + "learning_rate": 0.18634416668546552, + "loss": 0.0727, + "num_input_tokens_seen": 15272976, + "step": 16885 + }, + { + "epoch": 4.457700937046324, + "grad_norm": 0.0031721112318336964, + "learning_rate": 0.1862870142244328, + "loss": 0.1364, + "num_input_tokens_seen": 15277264, + "step": 16890 + }, + { + "epoch": 4.4590207206018215, + "grad_norm": 0.002016256796196103, + "learning_rate": 0.1862298561674865, + "loss": 0.068, + "num_input_tokens_seen": 15281712, + "step": 16895 + }, + { + "epoch": 4.460340504157318, + "grad_norm": 0.0014414418255910277, + "learning_rate": 0.18617269252344104, + "loss": 0.0758, + "num_input_tokens_seen": 15286384, + "step": 16900 + }, + { + "epoch": 4.4616602877128155, + "grad_norm": 0.0009491029195487499, + "learning_rate": 0.18611552330111186, + "loss": 0.0811, + "num_input_tokens_seen": 15290832, + "step": 16905 + }, + { + "epoch": 4.462980071268312, + "grad_norm": 0.0034008531365543604, + "learning_rate": 0.18605834850931507, + "loss": 0.0537, + "num_input_tokens_seen": 15295440, + "step": 16910 + }, + { + "epoch": 4.464299854823809, + "grad_norm": 0.0022209922317415476, + "learning_rate": 0.18600116815686787, + "loss": 0.068, + "num_input_tokens_seen": 15299952, + "step": 16915 + }, + { + "epoch": 4.465619638379306, + "grad_norm": 0.0013254106743261218, + "learning_rate": 0.1859439822525881, + "loss": 0.0895, + "num_input_tokens_seen": 15304368, + "step": 16920 + }, + { + "epoch": 4.466939421934803, + "grad_norm": 0.0010311888763681054, + "learning_rate": 0.18588679080529455, + "loss": 0.0766, + "num_input_tokens_seen": 15308720, + "step": 16925 + }, + { + "epoch": 4.468259205490299, + "grad_norm": 0.0029073646292090416, + "learning_rate": 0.1858295938238069, + "loss": 0.0646, + "num_input_tokens_seen": 15313008, + "step": 16930 + }, + { + "epoch": 4.469578989045797, + "grad_norm": 0.001058906433172524, + "learning_rate": 0.18577239131694562, + "loss": 0.0735, + "num_input_tokens_seen": 15317936, + "step": 16935 + }, + { + "epoch": 4.470898772601293, + "grad_norm": 0.00379694951698184, + "learning_rate": 0.18571518329353204, + "loss": 0.0905, + "num_input_tokens_seen": 15322768, + "step": 16940 + }, + { + "epoch": 4.472218556156791, + "grad_norm": 0.0018635065061971545, + "learning_rate": 0.18565796976238838, + "loss": 0.1081, + "num_input_tokens_seen": 15327344, + "step": 16945 + }, + { + "epoch": 4.473538339712287, + "grad_norm": 0.004798540845513344, + "learning_rate": 0.18560075073233764, + "loss": 0.0971, + "num_input_tokens_seen": 15331728, + "step": 16950 + }, + { + "epoch": 4.474858123267784, + "grad_norm": 0.0017048298614099622, + "learning_rate": 0.18554352621220377, + "loss": 0.0684, + "num_input_tokens_seen": 15336144, + "step": 16955 + }, + { + "epoch": 4.476177906823281, + "grad_norm": 0.000953376351390034, + "learning_rate": 0.18548629621081153, + "loss": 0.0475, + "num_input_tokens_seen": 15340528, + "step": 16960 + }, + { + "epoch": 4.477497690378778, + "grad_norm": 0.0015172215644270182, + "learning_rate": 0.18542906073698645, + "loss": 0.0409, + "num_input_tokens_seen": 15345136, + "step": 16965 + }, + { + "epoch": 4.478817473934274, + "grad_norm": 0.0027916072867810726, + "learning_rate": 0.18537181979955494, + "loss": 0.0836, + "num_input_tokens_seen": 15349904, + "step": 16970 + }, + { + "epoch": 4.480137257489772, + "grad_norm": 0.0026530767790973186, + "learning_rate": 0.18531457340734434, + "loss": 0.0656, + "num_input_tokens_seen": 15354128, + "step": 16975 + }, + { + "epoch": 4.481457041045268, + "grad_norm": 0.0024872818030416965, + "learning_rate": 0.1852573215691827, + "loss": 0.0774, + "num_input_tokens_seen": 15358320, + "step": 16980 + }, + { + "epoch": 4.482776824600766, + "grad_norm": 0.0024473017547279596, + "learning_rate": 0.18520006429389904, + "loss": 0.0842, + "num_input_tokens_seen": 15362704, + "step": 16985 + }, + { + "epoch": 4.484096608156262, + "grad_norm": 0.0008607142372056842, + "learning_rate": 0.1851428015903231, + "loss": 0.07, + "num_input_tokens_seen": 15367344, + "step": 16990 + }, + { + "epoch": 4.485416391711759, + "grad_norm": 0.00220400164835155, + "learning_rate": 0.1850855334672855, + "loss": 0.0552, + "num_input_tokens_seen": 15371920, + "step": 16995 + }, + { + "epoch": 4.486736175267256, + "grad_norm": 0.0021196703892201185, + "learning_rate": 0.1850282599336178, + "loss": 0.0435, + "num_input_tokens_seen": 15376368, + "step": 17000 + }, + { + "epoch": 4.486736175267256, + "eval_loss": 0.09600570797920227, + "eval_runtime": 77.6856, + "eval_samples_per_second": 86.696, + "eval_steps_per_second": 21.677, + "num_input_tokens_seen": 15376368, + "step": 17000 + }, + { + "epoch": 4.488055958822753, + "grad_norm": 0.0022884795907884836, + "learning_rate": 0.18497098099815215, + "loss": 0.0517, + "num_input_tokens_seen": 15380880, + "step": 17005 + }, + { + "epoch": 4.48937574237825, + "grad_norm": 0.0021293452009558678, + "learning_rate": 0.18491369666972174, + "loss": 0.1062, + "num_input_tokens_seen": 15385296, + "step": 17010 + }, + { + "epoch": 4.490695525933747, + "grad_norm": 0.0011511960765346885, + "learning_rate": 0.1848564069571606, + "loss": 0.0791, + "num_input_tokens_seen": 15390032, + "step": 17015 + }, + { + "epoch": 4.492015309489243, + "grad_norm": 0.0023191391956061125, + "learning_rate": 0.18479911186930348, + "loss": 0.0738, + "num_input_tokens_seen": 15394448, + "step": 17020 + }, + { + "epoch": 4.493335093044741, + "grad_norm": 0.0033607513178139925, + "learning_rate": 0.18474181141498597, + "loss": 0.1314, + "num_input_tokens_seen": 15399088, + "step": 17025 + }, + { + "epoch": 4.494654876600237, + "grad_norm": 0.0025984866078943014, + "learning_rate": 0.18468450560304453, + "loss": 0.135, + "num_input_tokens_seen": 15403600, + "step": 17030 + }, + { + "epoch": 4.495974660155735, + "grad_norm": 0.0019274074584245682, + "learning_rate": 0.1846271944423165, + "loss": 0.0904, + "num_input_tokens_seen": 15407984, + "step": 17035 + }, + { + "epoch": 4.497294443711231, + "grad_norm": 0.0010533775202929974, + "learning_rate": 0.18456987794163993, + "loss": 0.0538, + "num_input_tokens_seen": 15412624, + "step": 17040 + }, + { + "epoch": 4.498614227266728, + "grad_norm": 0.002149230567738414, + "learning_rate": 0.18451255610985373, + "loss": 0.109, + "num_input_tokens_seen": 15416816, + "step": 17045 + }, + { + "epoch": 4.499934010822225, + "grad_norm": 0.0025713411159813404, + "learning_rate": 0.18445522895579766, + "loss": 0.0843, + "num_input_tokens_seen": 15421296, + "step": 17050 + }, + { + "epoch": 4.501253794377722, + "grad_norm": 0.002635725773870945, + "learning_rate": 0.1843978964883123, + "loss": 0.0665, + "num_input_tokens_seen": 15425840, + "step": 17055 + }, + { + "epoch": 4.502573577933219, + "grad_norm": 0.0014056903310120106, + "learning_rate": 0.18434055871623906, + "loss": 0.0738, + "num_input_tokens_seen": 15430288, + "step": 17060 + }, + { + "epoch": 4.503893361488716, + "grad_norm": 0.0007264007581397891, + "learning_rate": 0.18428321564842007, + "loss": 0.0591, + "num_input_tokens_seen": 15435024, + "step": 17065 + }, + { + "epoch": 4.5052131450442126, + "grad_norm": 0.001577995833940804, + "learning_rate": 0.18422586729369841, + "loss": 0.055, + "num_input_tokens_seen": 15439312, + "step": 17070 + }, + { + "epoch": 4.50653292859971, + "grad_norm": 0.002571406541392207, + "learning_rate": 0.1841685136609179, + "loss": 0.0579, + "num_input_tokens_seen": 15443760, + "step": 17075 + }, + { + "epoch": 4.5078527121552066, + "grad_norm": 0.0035736344289034605, + "learning_rate": 0.18411115475892326, + "loss": 0.0809, + "num_input_tokens_seen": 15448400, + "step": 17080 + }, + { + "epoch": 4.509172495710703, + "grad_norm": 0.001474549644626677, + "learning_rate": 0.18405379059655982, + "loss": 0.0891, + "num_input_tokens_seen": 15452912, + "step": 17085 + }, + { + "epoch": 4.5104922792662006, + "grad_norm": 0.0019390624947845936, + "learning_rate": 0.1839964211826739, + "loss": 0.0523, + "num_input_tokens_seen": 15457648, + "step": 17090 + }, + { + "epoch": 4.511812062821697, + "grad_norm": 0.0011202064342796803, + "learning_rate": 0.18393904652611265, + "loss": 0.0626, + "num_input_tokens_seen": 15462320, + "step": 17095 + }, + { + "epoch": 4.513131846377194, + "grad_norm": 0.00209820456802845, + "learning_rate": 0.18388166663572392, + "loss": 0.0876, + "num_input_tokens_seen": 15466992, + "step": 17100 + }, + { + "epoch": 4.514451629932691, + "grad_norm": 0.0026799228508025408, + "learning_rate": 0.18382428152035643, + "loss": 0.0497, + "num_input_tokens_seen": 15471728, + "step": 17105 + }, + { + "epoch": 4.515771413488188, + "grad_norm": 0.002064944477751851, + "learning_rate": 0.1837668911888596, + "loss": 0.078, + "num_input_tokens_seen": 15475984, + "step": 17110 + }, + { + "epoch": 4.517091197043685, + "grad_norm": 0.0025182520039379597, + "learning_rate": 0.18370949565008388, + "loss": 0.1166, + "num_input_tokens_seen": 15480208, + "step": 17115 + }, + { + "epoch": 4.518410980599182, + "grad_norm": 0.002872597426176071, + "learning_rate": 0.1836520949128803, + "loss": 0.0591, + "num_input_tokens_seen": 15484880, + "step": 17120 + }, + { + "epoch": 4.519730764154678, + "grad_norm": 0.0020139412954449654, + "learning_rate": 0.18359468898610076, + "loss": 0.0885, + "num_input_tokens_seen": 15489424, + "step": 17125 + }, + { + "epoch": 4.521050547710176, + "grad_norm": 0.002816414926201105, + "learning_rate": 0.18353727787859797, + "loss": 0.072, + "num_input_tokens_seen": 15493808, + "step": 17130 + }, + { + "epoch": 4.522370331265672, + "grad_norm": 0.0007951182778924704, + "learning_rate": 0.18347986159922552, + "loss": 0.1094, + "num_input_tokens_seen": 15498352, + "step": 17135 + }, + { + "epoch": 4.52369011482117, + "grad_norm": 0.0007492451695725322, + "learning_rate": 0.1834224401568377, + "loss": 0.0574, + "num_input_tokens_seen": 15502832, + "step": 17140 + }, + { + "epoch": 4.525009898376666, + "grad_norm": 0.003357319161295891, + "learning_rate": 0.1833650135602896, + "loss": 0.1128, + "num_input_tokens_seen": 15507312, + "step": 17145 + }, + { + "epoch": 4.526329681932163, + "grad_norm": 0.0008887313888408244, + "learning_rate": 0.18330758181843707, + "loss": 0.0651, + "num_input_tokens_seen": 15511472, + "step": 17150 + }, + { + "epoch": 4.52764946548766, + "grad_norm": 0.0017255865968763828, + "learning_rate": 0.18325014494013686, + "loss": 0.083, + "num_input_tokens_seen": 15516176, + "step": 17155 + }, + { + "epoch": 4.528969249043157, + "grad_norm": 0.0035103734117001295, + "learning_rate": 0.18319270293424647, + "loss": 0.0647, + "num_input_tokens_seen": 15520720, + "step": 17160 + }, + { + "epoch": 4.530289032598654, + "grad_norm": 0.0018305330304428935, + "learning_rate": 0.18313525580962417, + "loss": 0.104, + "num_input_tokens_seen": 15525104, + "step": 17165 + }, + { + "epoch": 4.531608816154151, + "grad_norm": 0.0020815758034586906, + "learning_rate": 0.18307780357512896, + "loss": 0.077, + "num_input_tokens_seen": 15529648, + "step": 17170 + }, + { + "epoch": 4.532928599709647, + "grad_norm": 0.001201254315674305, + "learning_rate": 0.1830203462396208, + "loss": 0.0775, + "num_input_tokens_seen": 15534096, + "step": 17175 + }, + { + "epoch": 4.534248383265145, + "grad_norm": 0.0005726496456190944, + "learning_rate": 0.18296288381196033, + "loss": 0.0423, + "num_input_tokens_seen": 15538576, + "step": 17180 + }, + { + "epoch": 4.535568166820641, + "grad_norm": 0.003299225587397814, + "learning_rate": 0.1829054163010089, + "loss": 0.0687, + "num_input_tokens_seen": 15543024, + "step": 17185 + }, + { + "epoch": 4.536887950376139, + "grad_norm": 0.0013872559648007154, + "learning_rate": 0.18284794371562874, + "loss": 0.0746, + "num_input_tokens_seen": 15547280, + "step": 17190 + }, + { + "epoch": 4.538207733931635, + "grad_norm": 0.002069941023364663, + "learning_rate": 0.18279046606468288, + "loss": 0.0919, + "num_input_tokens_seen": 15551984, + "step": 17195 + }, + { + "epoch": 4.539527517487132, + "grad_norm": 0.0009614035370759666, + "learning_rate": 0.1827329833570351, + "loss": 0.0862, + "num_input_tokens_seen": 15556464, + "step": 17200 + }, + { + "epoch": 4.539527517487132, + "eval_loss": 0.0943543016910553, + "eval_runtime": 75.8889, + "eval_samples_per_second": 88.748, + "eval_steps_per_second": 22.19, + "num_input_tokens_seen": 15556464, + "step": 17200 + }, + { + "epoch": 4.540847301042629, + "grad_norm": 0.002252102829515934, + "learning_rate": 0.18267549560154991, + "loss": 0.0891, + "num_input_tokens_seen": 15560976, + "step": 17205 + }, + { + "epoch": 4.542167084598126, + "grad_norm": 0.0019103550584986806, + "learning_rate": 0.18261800280709267, + "loss": 0.0849, + "num_input_tokens_seen": 15565872, + "step": 17210 + }, + { + "epoch": 4.5434868681536225, + "grad_norm": 0.0051335543394088745, + "learning_rate": 0.18256050498252957, + "loss": 0.1273, + "num_input_tokens_seen": 15570704, + "step": 17215 + }, + { + "epoch": 4.54480665170912, + "grad_norm": 0.002699080854654312, + "learning_rate": 0.18250300213672735, + "loss": 0.0944, + "num_input_tokens_seen": 15575312, + "step": 17220 + }, + { + "epoch": 4.5461264352646165, + "grad_norm": 0.000670677749440074, + "learning_rate": 0.18244549427855378, + "loss": 0.0757, + "num_input_tokens_seen": 15579824, + "step": 17225 + }, + { + "epoch": 4.547446218820113, + "grad_norm": 0.0017720736796036363, + "learning_rate": 0.1823879814168772, + "loss": 0.0607, + "num_input_tokens_seen": 15584624, + "step": 17230 + }, + { + "epoch": 4.5487660023756105, + "grad_norm": 0.0022304286248981953, + "learning_rate": 0.18233046356056692, + "loss": 0.0828, + "num_input_tokens_seen": 15589200, + "step": 17235 + }, + { + "epoch": 4.550085785931107, + "grad_norm": 0.0011202323948964477, + "learning_rate": 0.18227294071849284, + "loss": 0.051, + "num_input_tokens_seen": 15593616, + "step": 17240 + }, + { + "epoch": 4.5514055694866045, + "grad_norm": 0.002453416818752885, + "learning_rate": 0.18221541289952578, + "loss": 0.0887, + "num_input_tokens_seen": 15598128, + "step": 17245 + }, + { + "epoch": 4.552725353042101, + "grad_norm": 0.0010223020799458027, + "learning_rate": 0.18215788011253717, + "loss": 0.0758, + "num_input_tokens_seen": 15602864, + "step": 17250 + }, + { + "epoch": 4.554045136597598, + "grad_norm": 0.0006564327632077038, + "learning_rate": 0.18210034236639935, + "loss": 0.0578, + "num_input_tokens_seen": 15607184, + "step": 17255 + }, + { + "epoch": 4.555364920153095, + "grad_norm": 0.0006818787660449743, + "learning_rate": 0.1820427996699853, + "loss": 0.053, + "num_input_tokens_seen": 15611440, + "step": 17260 + }, + { + "epoch": 4.556684703708592, + "grad_norm": 0.0012993229320272803, + "learning_rate": 0.1819852520321689, + "loss": 0.0421, + "num_input_tokens_seen": 15616080, + "step": 17265 + }, + { + "epoch": 4.558004487264089, + "grad_norm": 0.0038722532335668802, + "learning_rate": 0.18192769946182466, + "loss": 0.0864, + "num_input_tokens_seen": 15620816, + "step": 17270 + }, + { + "epoch": 4.559324270819586, + "grad_norm": 0.0026030431035906076, + "learning_rate": 0.18187014196782794, + "loss": 0.0898, + "num_input_tokens_seen": 15625584, + "step": 17275 + }, + { + "epoch": 4.560644054375082, + "grad_norm": 0.0024443031288683414, + "learning_rate": 0.18181257955905486, + "loss": 0.081, + "num_input_tokens_seen": 15630192, + "step": 17280 + }, + { + "epoch": 4.56196383793058, + "grad_norm": 0.002090043853968382, + "learning_rate": 0.18175501224438217, + "loss": 0.059, + "num_input_tokens_seen": 15634736, + "step": 17285 + }, + { + "epoch": 4.563283621486076, + "grad_norm": 0.0013812740799039602, + "learning_rate": 0.18169744003268756, + "loss": 0.1152, + "num_input_tokens_seen": 15639152, + "step": 17290 + }, + { + "epoch": 4.564603405041574, + "grad_norm": 0.0014691086253151298, + "learning_rate": 0.18163986293284937, + "loss": 0.0791, + "num_input_tokens_seen": 15643760, + "step": 17295 + }, + { + "epoch": 4.56592318859707, + "grad_norm": 0.0013774926774203777, + "learning_rate": 0.18158228095374673, + "loss": 0.0732, + "num_input_tokens_seen": 15648304, + "step": 17300 + }, + { + "epoch": 4.567242972152567, + "grad_norm": 0.001941782422363758, + "learning_rate": 0.18152469410425945, + "loss": 0.091, + "num_input_tokens_seen": 15652944, + "step": 17305 + }, + { + "epoch": 4.568562755708064, + "grad_norm": 0.0026795484591275454, + "learning_rate": 0.18146710239326813, + "loss": 0.0821, + "num_input_tokens_seen": 15657360, + "step": 17310 + }, + { + "epoch": 4.569882539263561, + "grad_norm": 0.0007263331790454686, + "learning_rate": 0.18140950582965423, + "loss": 0.079, + "num_input_tokens_seen": 15662128, + "step": 17315 + }, + { + "epoch": 4.571202322819058, + "grad_norm": 0.0029549964237958193, + "learning_rate": 0.1813519044222998, + "loss": 0.0568, + "num_input_tokens_seen": 15666352, + "step": 17320 + }, + { + "epoch": 4.572522106374555, + "grad_norm": 0.0015285544795915484, + "learning_rate": 0.18129429818008772, + "loss": 0.0671, + "num_input_tokens_seen": 15670864, + "step": 17325 + }, + { + "epoch": 4.573841889930051, + "grad_norm": 0.0023639199789613485, + "learning_rate": 0.18123668711190163, + "loss": 0.0744, + "num_input_tokens_seen": 15675504, + "step": 17330 + }, + { + "epoch": 4.575161673485549, + "grad_norm": 0.0013981316005811095, + "learning_rate": 0.18117907122662583, + "loss": 0.1088, + "num_input_tokens_seen": 15679952, + "step": 17335 + }, + { + "epoch": 4.576481457041045, + "grad_norm": 0.001590773812495172, + "learning_rate": 0.1811214505331454, + "loss": 0.0715, + "num_input_tokens_seen": 15684208, + "step": 17340 + }, + { + "epoch": 4.577801240596542, + "grad_norm": 0.0017007759306579828, + "learning_rate": 0.1810638250403462, + "loss": 0.0754, + "num_input_tokens_seen": 15688560, + "step": 17345 + }, + { + "epoch": 4.579121024152039, + "grad_norm": 0.0029063066467642784, + "learning_rate": 0.1810061947571148, + "loss": 0.0819, + "num_input_tokens_seen": 15693008, + "step": 17350 + }, + { + "epoch": 4.580440807707536, + "grad_norm": 0.0030616663862019777, + "learning_rate": 0.1809485596923385, + "loss": 0.0894, + "num_input_tokens_seen": 15697488, + "step": 17355 + }, + { + "epoch": 4.581760591263032, + "grad_norm": 0.002183089731261134, + "learning_rate": 0.18089091985490546, + "loss": 0.0806, + "num_input_tokens_seen": 15701808, + "step": 17360 + }, + { + "epoch": 4.58308037481853, + "grad_norm": 0.002681178506463766, + "learning_rate": 0.18083327525370432, + "loss": 0.0707, + "num_input_tokens_seen": 15706512, + "step": 17365 + }, + { + "epoch": 4.584400158374026, + "grad_norm": 0.001451960881240666, + "learning_rate": 0.18077562589762464, + "loss": 0.0628, + "num_input_tokens_seen": 15710928, + "step": 17370 + }, + { + "epoch": 4.585719941929524, + "grad_norm": 0.002561755245551467, + "learning_rate": 0.1807179717955567, + "loss": 0.0621, + "num_input_tokens_seen": 15715280, + "step": 17375 + }, + { + "epoch": 4.58703972548502, + "grad_norm": 0.000848734169267118, + "learning_rate": 0.1806603129563915, + "loss": 0.0685, + "num_input_tokens_seen": 15720176, + "step": 17380 + }, + { + "epoch": 4.588359509040517, + "grad_norm": 0.001787328626960516, + "learning_rate": 0.1806026493890208, + "loss": 0.0888, + "num_input_tokens_seen": 15724560, + "step": 17385 + }, + { + "epoch": 4.589679292596014, + "grad_norm": 0.0025409699883311987, + "learning_rate": 0.18054498110233688, + "loss": 0.0725, + "num_input_tokens_seen": 15729072, + "step": 17390 + }, + { + "epoch": 4.590999076151511, + "grad_norm": 0.0029048353899270296, + "learning_rate": 0.1804873081052331, + "loss": 0.1088, + "num_input_tokens_seen": 15733840, + "step": 17395 + }, + { + "epoch": 4.592318859707008, + "grad_norm": 0.0009727536235004663, + "learning_rate": 0.18042963040660326, + "loss": 0.0601, + "num_input_tokens_seen": 15738448, + "step": 17400 + }, + { + "epoch": 4.592318859707008, + "eval_loss": 0.09989480674266815, + "eval_runtime": 76.0021, + "eval_samples_per_second": 88.616, + "eval_steps_per_second": 22.157, + "num_input_tokens_seen": 15738448, + "step": 17400 + }, + { + "epoch": 4.593638643262505, + "grad_norm": 0.002966187195852399, + "learning_rate": 0.180371948015342, + "loss": 0.0711, + "num_input_tokens_seen": 15743120, + "step": 17405 + }, + { + "epoch": 4.5949584268180015, + "grad_norm": 0.0018302288372069597, + "learning_rate": 0.18031426094034472, + "loss": 0.0478, + "num_input_tokens_seen": 15747536, + "step": 17410 + }, + { + "epoch": 4.596278210373499, + "grad_norm": 0.0015903501771390438, + "learning_rate": 0.18025656919050737, + "loss": 0.072, + "num_input_tokens_seen": 15752144, + "step": 17415 + }, + { + "epoch": 4.5975979939289955, + "grad_norm": 0.0016843833727762103, + "learning_rate": 0.18019887277472688, + "loss": 0.1147, + "num_input_tokens_seen": 15756496, + "step": 17420 + }, + { + "epoch": 4.598917777484493, + "grad_norm": 0.0007426468073390424, + "learning_rate": 0.18014117170190067, + "loss": 0.0718, + "num_input_tokens_seen": 15760912, + "step": 17425 + }, + { + "epoch": 4.6002375610399895, + "grad_norm": 0.00285494327545166, + "learning_rate": 0.18008346598092703, + "loss": 0.0797, + "num_input_tokens_seen": 15766000, + "step": 17430 + }, + { + "epoch": 4.601557344595486, + "grad_norm": 0.001359619782306254, + "learning_rate": 0.18002575562070489, + "loss": 0.0319, + "num_input_tokens_seen": 15770448, + "step": 17435 + }, + { + "epoch": 4.6028771281509835, + "grad_norm": 0.002094987314194441, + "learning_rate": 0.1799680406301339, + "loss": 0.0931, + "num_input_tokens_seen": 15774736, + "step": 17440 + }, + { + "epoch": 4.60419691170648, + "grad_norm": 0.0032751881517469883, + "learning_rate": 0.17991032101811447, + "loss": 0.1001, + "num_input_tokens_seen": 15779312, + "step": 17445 + }, + { + "epoch": 4.6055166952619775, + "grad_norm": 0.0016887730453163385, + "learning_rate": 0.1798525967935476, + "loss": 0.0604, + "num_input_tokens_seen": 15783600, + "step": 17450 + }, + { + "epoch": 4.606836478817474, + "grad_norm": 0.0021255824249237776, + "learning_rate": 0.17979486796533517, + "loss": 0.1176, + "num_input_tokens_seen": 15787984, + "step": 17455 + }, + { + "epoch": 4.608156262372971, + "grad_norm": 0.0015582469059154391, + "learning_rate": 0.1797371345423797, + "loss": 0.0847, + "num_input_tokens_seen": 15792560, + "step": 17460 + }, + { + "epoch": 4.609476045928468, + "grad_norm": 0.001912951935082674, + "learning_rate": 0.17967939653358436, + "loss": 0.0644, + "num_input_tokens_seen": 15797136, + "step": 17465 + }, + { + "epoch": 4.610795829483965, + "grad_norm": 0.0017248039366677403, + "learning_rate": 0.17962165394785315, + "loss": 0.0409, + "num_input_tokens_seen": 15801584, + "step": 17470 + }, + { + "epoch": 4.612115613039461, + "grad_norm": 0.002759300172328949, + "learning_rate": 0.17956390679409057, + "loss": 0.1232, + "num_input_tokens_seen": 15806064, + "step": 17475 + }, + { + "epoch": 4.613435396594959, + "grad_norm": 0.0026244064792990685, + "learning_rate": 0.1795061550812021, + "loss": 0.0881, + "num_input_tokens_seen": 15810576, + "step": 17480 + }, + { + "epoch": 4.614755180150455, + "grad_norm": 0.0011013540206477046, + "learning_rate": 0.1794483988180937, + "loss": 0.0798, + "num_input_tokens_seen": 15815120, + "step": 17485 + }, + { + "epoch": 4.616074963705953, + "grad_norm": 0.002310894662514329, + "learning_rate": 0.17939063801367214, + "loss": 0.1208, + "num_input_tokens_seen": 15819792, + "step": 17490 + }, + { + "epoch": 4.617394747261449, + "grad_norm": 0.0030531275551766157, + "learning_rate": 0.17933287267684483, + "loss": 0.079, + "num_input_tokens_seen": 15824208, + "step": 17495 + }, + { + "epoch": 4.618714530816946, + "grad_norm": 0.0021063764579594135, + "learning_rate": 0.17927510281651995, + "loss": 0.0525, + "num_input_tokens_seen": 15828976, + "step": 17500 + }, + { + "epoch": 4.620034314372443, + "grad_norm": 0.0009976894361898303, + "learning_rate": 0.17921732844160634, + "loss": 0.0894, + "num_input_tokens_seen": 15833296, + "step": 17505 + }, + { + "epoch": 4.62135409792794, + "grad_norm": 0.0028240287210792303, + "learning_rate": 0.17915954956101351, + "loss": 0.0771, + "num_input_tokens_seen": 15837840, + "step": 17510 + }, + { + "epoch": 4.622673881483436, + "grad_norm": 0.0023958745878189802, + "learning_rate": 0.17910176618365165, + "loss": 0.0801, + "num_input_tokens_seen": 15842544, + "step": 17515 + }, + { + "epoch": 4.623993665038934, + "grad_norm": 0.002998670330271125, + "learning_rate": 0.17904397831843177, + "loss": 0.0781, + "num_input_tokens_seen": 15847248, + "step": 17520 + }, + { + "epoch": 4.62531344859443, + "grad_norm": 0.00272256787866354, + "learning_rate": 0.17898618597426547, + "loss": 0.1013, + "num_input_tokens_seen": 15851568, + "step": 17525 + }, + { + "epoch": 4.626633232149928, + "grad_norm": 0.0015995557187125087, + "learning_rate": 0.17892838916006495, + "loss": 0.0708, + "num_input_tokens_seen": 15856272, + "step": 17530 + }, + { + "epoch": 4.627953015705424, + "grad_norm": 0.0029379057232290506, + "learning_rate": 0.17887058788474333, + "loss": 0.0698, + "num_input_tokens_seen": 15860816, + "step": 17535 + }, + { + "epoch": 4.629272799260921, + "grad_norm": 0.0011855366174131632, + "learning_rate": 0.17881278215721427, + "loss": 0.0779, + "num_input_tokens_seen": 15865232, + "step": 17540 + }, + { + "epoch": 4.630592582816418, + "grad_norm": 0.002552383579313755, + "learning_rate": 0.1787549719863921, + "loss": 0.0753, + "num_input_tokens_seen": 15869872, + "step": 17545 + }, + { + "epoch": 4.631912366371915, + "grad_norm": 0.0035990015603601933, + "learning_rate": 0.17869715738119188, + "loss": 0.0576, + "num_input_tokens_seen": 15874736, + "step": 17550 + }, + { + "epoch": 4.633232149927412, + "grad_norm": 0.0017518443055450916, + "learning_rate": 0.17863933835052936, + "loss": 0.0977, + "num_input_tokens_seen": 15879216, + "step": 17555 + }, + { + "epoch": 4.634551933482909, + "grad_norm": 0.0018878639675676823, + "learning_rate": 0.17858151490332097, + "loss": 0.0521, + "num_input_tokens_seen": 15883344, + "step": 17560 + }, + { + "epoch": 4.635871717038405, + "grad_norm": 0.002660009078681469, + "learning_rate": 0.17852368704848381, + "loss": 0.0737, + "num_input_tokens_seen": 15887920, + "step": 17565 + }, + { + "epoch": 4.637191500593903, + "grad_norm": 0.0032761564943939447, + "learning_rate": 0.17846585479493565, + "loss": 0.0449, + "num_input_tokens_seen": 15892528, + "step": 17570 + }, + { + "epoch": 4.638511284149399, + "grad_norm": 0.002229034435003996, + "learning_rate": 0.178408018151595, + "loss": 0.0641, + "num_input_tokens_seen": 15897104, + "step": 17575 + }, + { + "epoch": 4.639831067704897, + "grad_norm": 0.0038696553092449903, + "learning_rate": 0.17835017712738085, + "loss": 0.0991, + "num_input_tokens_seen": 15901744, + "step": 17580 + }, + { + "epoch": 4.641150851260393, + "grad_norm": 0.0031302799470722675, + "learning_rate": 0.17829233173121323, + "loss": 0.1102, + "num_input_tokens_seen": 15906384, + "step": 17585 + }, + { + "epoch": 4.64247063481589, + "grad_norm": 0.00221181008964777, + "learning_rate": 0.17823448197201244, + "loss": 0.0766, + "num_input_tokens_seen": 15910928, + "step": 17590 + }, + { + "epoch": 4.643790418371387, + "grad_norm": 0.0019807193893939257, + "learning_rate": 0.1781766278586997, + "loss": 0.0876, + "num_input_tokens_seen": 15915376, + "step": 17595 + }, + { + "epoch": 4.645110201926884, + "grad_norm": 0.002395015675574541, + "learning_rate": 0.1781187694001969, + "loss": 0.0972, + "num_input_tokens_seen": 15919856, + "step": 17600 + }, + { + "epoch": 4.645110201926884, + "eval_loss": 0.0918474867939949, + "eval_runtime": 75.9705, + "eval_samples_per_second": 88.653, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 15919856, + "step": 17600 + }, + { + "epoch": 4.6464299854823805, + "grad_norm": 0.0021354432683438063, + "learning_rate": 0.1780609066054265, + "loss": 0.0856, + "num_input_tokens_seen": 15924176, + "step": 17605 + }, + { + "epoch": 4.647749769037878, + "grad_norm": 0.002914890879765153, + "learning_rate": 0.17800303948331164, + "loss": 0.1309, + "num_input_tokens_seen": 15928720, + "step": 17610 + }, + { + "epoch": 4.6490695525933745, + "grad_norm": 0.0014143713051453233, + "learning_rate": 0.1779451680427762, + "loss": 0.0956, + "num_input_tokens_seen": 15933456, + "step": 17615 + }, + { + "epoch": 4.650389336148872, + "grad_norm": 0.0033772201277315617, + "learning_rate": 0.17788729229274464, + "loss": 0.0937, + "num_input_tokens_seen": 15938288, + "step": 17620 + }, + { + "epoch": 4.6517091197043685, + "grad_norm": 0.002178548602387309, + "learning_rate": 0.17782941224214222, + "loss": 0.0622, + "num_input_tokens_seen": 15942768, + "step": 17625 + }, + { + "epoch": 4.653028903259865, + "grad_norm": 0.00125894567463547, + "learning_rate": 0.17777152789989464, + "loss": 0.1014, + "num_input_tokens_seen": 15947248, + "step": 17630 + }, + { + "epoch": 4.6543486868153625, + "grad_norm": 0.0011518356623128057, + "learning_rate": 0.17771363927492845, + "loss": 0.0581, + "num_input_tokens_seen": 15951632, + "step": 17635 + }, + { + "epoch": 4.655668470370859, + "grad_norm": 0.001722709508612752, + "learning_rate": 0.17765574637617085, + "loss": 0.0723, + "num_input_tokens_seen": 15955856, + "step": 17640 + }, + { + "epoch": 4.656988253926356, + "grad_norm": 0.002558761043474078, + "learning_rate": 0.17759784921254962, + "loss": 0.0786, + "num_input_tokens_seen": 15960272, + "step": 17645 + }, + { + "epoch": 4.658308037481853, + "grad_norm": 0.0017883385298773646, + "learning_rate": 0.1775399477929932, + "loss": 0.0743, + "num_input_tokens_seen": 15964624, + "step": 17650 + }, + { + "epoch": 4.65962782103735, + "grad_norm": 0.0021339852828532457, + "learning_rate": 0.17748204212643076, + "loss": 0.068, + "num_input_tokens_seen": 15969360, + "step": 17655 + }, + { + "epoch": 4.660947604592847, + "grad_norm": 0.0016567246057093143, + "learning_rate": 0.17742413222179204, + "loss": 0.0655, + "num_input_tokens_seen": 15973680, + "step": 17660 + }, + { + "epoch": 4.662267388148344, + "grad_norm": 0.0010465839877724648, + "learning_rate": 0.17736621808800754, + "loss": 0.0508, + "num_input_tokens_seen": 15978096, + "step": 17665 + }, + { + "epoch": 4.66358717170384, + "grad_norm": 0.003403707407414913, + "learning_rate": 0.17730829973400827, + "loss": 0.0974, + "num_input_tokens_seen": 15982416, + "step": 17670 + }, + { + "epoch": 4.664906955259338, + "grad_norm": 0.0011503822170197964, + "learning_rate": 0.17725037716872602, + "loss": 0.0731, + "num_input_tokens_seen": 15986896, + "step": 17675 + }, + { + "epoch": 4.666226738814834, + "grad_norm": 0.002784040989354253, + "learning_rate": 0.17719245040109313, + "loss": 0.0704, + "num_input_tokens_seen": 15991472, + "step": 17680 + }, + { + "epoch": 4.667546522370332, + "grad_norm": 0.00370439188554883, + "learning_rate": 0.17713451944004271, + "loss": 0.1106, + "num_input_tokens_seen": 15995984, + "step": 17685 + }, + { + "epoch": 4.668866305925828, + "grad_norm": 0.0018743695691227913, + "learning_rate": 0.17707658429450843, + "loss": 0.067, + "num_input_tokens_seen": 16000528, + "step": 17690 + }, + { + "epoch": 4.670186089481325, + "grad_norm": 0.002493085339665413, + "learning_rate": 0.1770186449734245, + "loss": 0.1008, + "num_input_tokens_seen": 16005008, + "step": 17695 + }, + { + "epoch": 4.671505873036822, + "grad_norm": 0.001196713768877089, + "learning_rate": 0.17696070148572599, + "loss": 0.0998, + "num_input_tokens_seen": 16009424, + "step": 17700 + }, + { + "epoch": 4.672825656592319, + "grad_norm": 0.0019909213297069073, + "learning_rate": 0.17690275384034856, + "loss": 0.0586, + "num_input_tokens_seen": 16014032, + "step": 17705 + }, + { + "epoch": 4.674145440147816, + "grad_norm": 0.0026577068492770195, + "learning_rate": 0.17684480204622835, + "loss": 0.0846, + "num_input_tokens_seen": 16018704, + "step": 17710 + }, + { + "epoch": 4.675465223703313, + "grad_norm": 0.0023822777438908815, + "learning_rate": 0.1767868461123023, + "loss": 0.0944, + "num_input_tokens_seen": 16023216, + "step": 17715 + }, + { + "epoch": 4.676785007258809, + "grad_norm": 0.0016283032018691301, + "learning_rate": 0.176728886047508, + "loss": 0.0829, + "num_input_tokens_seen": 16027824, + "step": 17720 + }, + { + "epoch": 4.678104790814307, + "grad_norm": 0.0013135474873706698, + "learning_rate": 0.17667092186078362, + "loss": 0.063, + "num_input_tokens_seen": 16032496, + "step": 17725 + }, + { + "epoch": 4.679424574369803, + "grad_norm": 0.0026227368507534266, + "learning_rate": 0.17661295356106785, + "loss": 0.0825, + "num_input_tokens_seen": 16036816, + "step": 17730 + }, + { + "epoch": 4.680744357925301, + "grad_norm": 0.0007876810850575566, + "learning_rate": 0.1765549811573002, + "loss": 0.0704, + "num_input_tokens_seen": 16041168, + "step": 17735 + }, + { + "epoch": 4.682064141480797, + "grad_norm": 0.002515863860026002, + "learning_rate": 0.17649700465842078, + "loss": 0.0751, + "num_input_tokens_seen": 16045616, + "step": 17740 + }, + { + "epoch": 4.683383925036294, + "grad_norm": 0.002859212225303054, + "learning_rate": 0.17643902407337023, + "loss": 0.0857, + "num_input_tokens_seen": 16050128, + "step": 17745 + }, + { + "epoch": 4.684703708591791, + "grad_norm": 0.0026012409944087267, + "learning_rate": 0.17638103941108993, + "loss": 0.0691, + "num_input_tokens_seen": 16054576, + "step": 17750 + }, + { + "epoch": 4.686023492147288, + "grad_norm": 0.0029831193387508392, + "learning_rate": 0.1763230506805218, + "loss": 0.1045, + "num_input_tokens_seen": 16058960, + "step": 17755 + }, + { + "epoch": 4.687343275702784, + "grad_norm": 0.0011041118996217847, + "learning_rate": 0.1762650578906085, + "loss": 0.0863, + "num_input_tokens_seen": 16063312, + "step": 17760 + }, + { + "epoch": 4.688663059258282, + "grad_norm": 0.0010338060092180967, + "learning_rate": 0.1762070610502932, + "loss": 0.0746, + "num_input_tokens_seen": 16067792, + "step": 17765 + }, + { + "epoch": 4.689982842813778, + "grad_norm": 0.0018870839849114418, + "learning_rate": 0.17614906016851975, + "loss": 0.0851, + "num_input_tokens_seen": 16072656, + "step": 17770 + }, + { + "epoch": 4.691302626369275, + "grad_norm": 0.0016552414745092392, + "learning_rate": 0.17609105525423258, + "loss": 0.0846, + "num_input_tokens_seen": 16077104, + "step": 17775 + }, + { + "epoch": 4.692622409924772, + "grad_norm": 0.001580483396537602, + "learning_rate": 0.1760330463163768, + "loss": 0.0523, + "num_input_tokens_seen": 16082000, + "step": 17780 + }, + { + "epoch": 4.693942193480269, + "grad_norm": 0.0022257529199123383, + "learning_rate": 0.17597503336389816, + "loss": 0.0608, + "num_input_tokens_seen": 16086448, + "step": 17785 + }, + { + "epoch": 4.695261977035766, + "grad_norm": 0.001937027438543737, + "learning_rate": 0.17591701640574298, + "loss": 0.0774, + "num_input_tokens_seen": 16091056, + "step": 17790 + }, + { + "epoch": 4.696581760591263, + "grad_norm": 0.0011337425094097853, + "learning_rate": 0.17585899545085815, + "loss": 0.0737, + "num_input_tokens_seen": 16095568, + "step": 17795 + }, + { + "epoch": 4.6979015441467595, + "grad_norm": 0.0027981274761259556, + "learning_rate": 0.17580097050819124, + "loss": 0.1025, + "num_input_tokens_seen": 16100016, + "step": 17800 + }, + { + "epoch": 4.6979015441467595, + "eval_loss": 0.09721826761960983, + "eval_runtime": 75.8766, + "eval_samples_per_second": 88.762, + "eval_steps_per_second": 22.194, + "num_input_tokens_seen": 16100016, + "step": 17800 + }, + { + "epoch": 4.699221327702257, + "grad_norm": 0.00410734536126256, + "learning_rate": 0.17574294158669046, + "loss": 0.0857, + "num_input_tokens_seen": 16104592, + "step": 17805 + }, + { + "epoch": 4.7005411112577535, + "grad_norm": 0.005809989757835865, + "learning_rate": 0.17568490869530456, + "loss": 0.0985, + "num_input_tokens_seen": 16108976, + "step": 17810 + }, + { + "epoch": 4.701860894813251, + "grad_norm": 0.0012003404553979635, + "learning_rate": 0.17562687184298295, + "loss": 0.1027, + "num_input_tokens_seen": 16113392, + "step": 17815 + }, + { + "epoch": 4.7031806783687475, + "grad_norm": 0.002431460889056325, + "learning_rate": 0.1755688310386757, + "loss": 0.0596, + "num_input_tokens_seen": 16118256, + "step": 17820 + }, + { + "epoch": 4.704500461924244, + "grad_norm": 0.0015232126461341977, + "learning_rate": 0.17551078629133335, + "loss": 0.0926, + "num_input_tokens_seen": 16122928, + "step": 17825 + }, + { + "epoch": 4.7058202454797415, + "grad_norm": 0.0016208095476031303, + "learning_rate": 0.17545273760990718, + "loss": 0.0551, + "num_input_tokens_seen": 16127280, + "step": 17830 + }, + { + "epoch": 4.707140029035238, + "grad_norm": 0.002203751588240266, + "learning_rate": 0.17539468500334904, + "loss": 0.0802, + "num_input_tokens_seen": 16131792, + "step": 17835 + }, + { + "epoch": 4.7084598125907355, + "grad_norm": 0.0020512964110821486, + "learning_rate": 0.17533662848061132, + "loss": 0.057, + "num_input_tokens_seen": 16136144, + "step": 17840 + }, + { + "epoch": 4.709779596146232, + "grad_norm": 0.0024935854598879814, + "learning_rate": 0.1752785680506471, + "loss": 0.1081, + "num_input_tokens_seen": 16140432, + "step": 17845 + }, + { + "epoch": 4.711099379701729, + "grad_norm": 0.0018050509970635176, + "learning_rate": 0.17522050372241, + "loss": 0.0697, + "num_input_tokens_seen": 16144720, + "step": 17850 + }, + { + "epoch": 4.712419163257226, + "grad_norm": 0.0011764898663386703, + "learning_rate": 0.17516243550485425, + "loss": 0.0495, + "num_input_tokens_seen": 16148912, + "step": 17855 + }, + { + "epoch": 4.713738946812723, + "grad_norm": 0.002110353671014309, + "learning_rate": 0.17510436340693478, + "loss": 0.0827, + "num_input_tokens_seen": 16153680, + "step": 17860 + }, + { + "epoch": 4.71505873036822, + "grad_norm": 0.001842002966441214, + "learning_rate": 0.175046287437607, + "loss": 0.1061, + "num_input_tokens_seen": 16158224, + "step": 17865 + }, + { + "epoch": 4.716378513923717, + "grad_norm": 0.0019439425086602569, + "learning_rate": 0.17498820760582695, + "loss": 0.0694, + "num_input_tokens_seen": 16162544, + "step": 17870 + }, + { + "epoch": 4.717698297479213, + "grad_norm": 0.00180989527143538, + "learning_rate": 0.1749301239205512, + "loss": 0.0549, + "num_input_tokens_seen": 16167280, + "step": 17875 + }, + { + "epoch": 4.719018081034711, + "grad_norm": 0.0014475563075393438, + "learning_rate": 0.1748720363907371, + "loss": 0.051, + "num_input_tokens_seen": 16171664, + "step": 17880 + }, + { + "epoch": 4.720337864590207, + "grad_norm": 0.00266922521404922, + "learning_rate": 0.17481394502534242, + "loss": 0.0688, + "num_input_tokens_seen": 16176592, + "step": 17885 + }, + { + "epoch": 4.721657648145704, + "grad_norm": 0.002714933827519417, + "learning_rate": 0.17475584983332562, + "loss": 0.0824, + "num_input_tokens_seen": 16180976, + "step": 17890 + }, + { + "epoch": 4.722977431701201, + "grad_norm": 0.0007122533861547709, + "learning_rate": 0.17469775082364558, + "loss": 0.0639, + "num_input_tokens_seen": 16185488, + "step": 17895 + }, + { + "epoch": 4.724297215256698, + "grad_norm": 0.0011735883308574557, + "learning_rate": 0.17463964800526205, + "loss": 0.056, + "num_input_tokens_seen": 16190192, + "step": 17900 + }, + { + "epoch": 4.725616998812194, + "grad_norm": 0.003266391111537814, + "learning_rate": 0.17458154138713522, + "loss": 0.1036, + "num_input_tokens_seen": 16194704, + "step": 17905 + }, + { + "epoch": 4.726936782367692, + "grad_norm": 0.0018520549638196826, + "learning_rate": 0.17452343097822576, + "loss": 0.1148, + "num_input_tokens_seen": 16199056, + "step": 17910 + }, + { + "epoch": 4.728256565923188, + "grad_norm": 0.0020546335726976395, + "learning_rate": 0.17446531678749497, + "loss": 0.1053, + "num_input_tokens_seen": 16203440, + "step": 17915 + }, + { + "epoch": 4.729576349478686, + "grad_norm": 0.002214714651927352, + "learning_rate": 0.17440719882390496, + "loss": 0.0577, + "num_input_tokens_seen": 16207984, + "step": 17920 + }, + { + "epoch": 4.730896133034182, + "grad_norm": 0.0012403931468725204, + "learning_rate": 0.17434907709641814, + "loss": 0.0625, + "num_input_tokens_seen": 16212944, + "step": 17925 + }, + { + "epoch": 4.732215916589679, + "grad_norm": 0.004662972874939442, + "learning_rate": 0.17429095161399769, + "loss": 0.1208, + "num_input_tokens_seen": 16217456, + "step": 17930 + }, + { + "epoch": 4.733535700145176, + "grad_norm": 0.0022338470444083214, + "learning_rate": 0.1742328223856072, + "loss": 0.0627, + "num_input_tokens_seen": 16222128, + "step": 17935 + }, + { + "epoch": 4.734855483700673, + "grad_norm": 0.002706049708649516, + "learning_rate": 0.174174689420211, + "loss": 0.0517, + "num_input_tokens_seen": 16226864, + "step": 17940 + }, + { + "epoch": 4.73617526725617, + "grad_norm": 0.002376090968027711, + "learning_rate": 0.1741165527267739, + "loss": 0.0622, + "num_input_tokens_seen": 16231344, + "step": 17945 + }, + { + "epoch": 4.737495050811667, + "grad_norm": 0.003476453712210059, + "learning_rate": 0.17405841231426125, + "loss": 0.0618, + "num_input_tokens_seen": 16236048, + "step": 17950 + }, + { + "epoch": 4.738814834367163, + "grad_norm": 0.002502476330846548, + "learning_rate": 0.1740002681916391, + "loss": 0.0982, + "num_input_tokens_seen": 16240752, + "step": 17955 + }, + { + "epoch": 4.740134617922661, + "grad_norm": 0.00293655670247972, + "learning_rate": 0.17394212036787401, + "loss": 0.1067, + "num_input_tokens_seen": 16245328, + "step": 17960 + }, + { + "epoch": 4.741454401478157, + "grad_norm": 0.0024879509583115578, + "learning_rate": 0.1738839688519331, + "loss": 0.0801, + "num_input_tokens_seen": 16249936, + "step": 17965 + }, + { + "epoch": 4.742774185033655, + "grad_norm": 0.000962028163485229, + "learning_rate": 0.17382581365278402, + "loss": 0.0659, + "num_input_tokens_seen": 16254832, + "step": 17970 + }, + { + "epoch": 4.7440939685891514, + "grad_norm": 0.003323270007967949, + "learning_rate": 0.17376765477939507, + "loss": 0.1001, + "num_input_tokens_seen": 16259440, + "step": 17975 + }, + { + "epoch": 4.745413752144648, + "grad_norm": 0.0006588593241758645, + "learning_rate": 0.1737094922407351, + "loss": 0.0504, + "num_input_tokens_seen": 16264144, + "step": 17980 + }, + { + "epoch": 4.7467335357001454, + "grad_norm": 0.0018257732735946774, + "learning_rate": 0.1736513260457734, + "loss": 0.0847, + "num_input_tokens_seen": 16268784, + "step": 17985 + }, + { + "epoch": 4.748053319255642, + "grad_norm": 0.0012416379759088159, + "learning_rate": 0.17359315620348006, + "loss": 0.091, + "num_input_tokens_seen": 16273264, + "step": 17990 + }, + { + "epoch": 4.7493731028111394, + "grad_norm": 0.003252837574109435, + "learning_rate": 0.17353498272282547, + "loss": 0.0816, + "num_input_tokens_seen": 16277744, + "step": 17995 + }, + { + "epoch": 4.750692886366636, + "grad_norm": 0.002274246420711279, + "learning_rate": 0.17347680561278087, + "loss": 0.1033, + "num_input_tokens_seen": 16282288, + "step": 18000 + }, + { + "epoch": 4.750692886366636, + "eval_loss": 0.09273525327444077, + "eval_runtime": 75.9774, + "eval_samples_per_second": 88.645, + "eval_steps_per_second": 22.164, + "num_input_tokens_seen": 16282288, + "step": 18000 + }, + { + "epoch": 4.752012669922133, + "grad_norm": 0.0007206624140962958, + "learning_rate": 0.1734186248823178, + "loss": 0.0752, + "num_input_tokens_seen": 16286768, + "step": 18005 + }, + { + "epoch": 4.75333245347763, + "grad_norm": 0.0013069409178569913, + "learning_rate": 0.17336044054040844, + "loss": 0.063, + "num_input_tokens_seen": 16291024, + "step": 18010 + }, + { + "epoch": 4.754652237033127, + "grad_norm": 0.001207399764098227, + "learning_rate": 0.1733022525960256, + "loss": 0.0695, + "num_input_tokens_seen": 16295408, + "step": 18015 + }, + { + "epoch": 4.755972020588623, + "grad_norm": 0.0026103004347532988, + "learning_rate": 0.1732440610581426, + "loss": 0.075, + "num_input_tokens_seen": 16299600, + "step": 18020 + }, + { + "epoch": 4.757291804144121, + "grad_norm": 0.000873565033543855, + "learning_rate": 0.17318586593573326, + "loss": 0.0767, + "num_input_tokens_seen": 16304464, + "step": 18025 + }, + { + "epoch": 4.758611587699617, + "grad_norm": 0.0009229430579580367, + "learning_rate": 0.17312766723777204, + "loss": 0.0916, + "num_input_tokens_seen": 16308944, + "step": 18030 + }, + { + "epoch": 4.759931371255114, + "grad_norm": 0.0015196563908830285, + "learning_rate": 0.1730694649732339, + "loss": 0.0803, + "num_input_tokens_seen": 16313680, + "step": 18035 + }, + { + "epoch": 4.761251154810611, + "grad_norm": 0.0011459797387942672, + "learning_rate": 0.17301125915109428, + "loss": 0.0817, + "num_input_tokens_seen": 16318256, + "step": 18040 + }, + { + "epoch": 4.762570938366108, + "grad_norm": 0.0007054831366986036, + "learning_rate": 0.17295304978032938, + "loss": 0.0679, + "num_input_tokens_seen": 16322672, + "step": 18045 + }, + { + "epoch": 4.763890721921605, + "grad_norm": 0.0012885661562904716, + "learning_rate": 0.17289483686991577, + "loss": 0.0882, + "num_input_tokens_seen": 16327120, + "step": 18050 + }, + { + "epoch": 4.765210505477102, + "grad_norm": 0.0015101423487067223, + "learning_rate": 0.1728366204288306, + "loss": 0.0787, + "num_input_tokens_seen": 16331568, + "step": 18055 + }, + { + "epoch": 4.766530289032598, + "grad_norm": 0.003513315226882696, + "learning_rate": 0.17277840046605153, + "loss": 0.0931, + "num_input_tokens_seen": 16335728, + "step": 18060 + }, + { + "epoch": 4.767850072588096, + "grad_norm": 0.0008560552960261703, + "learning_rate": 0.17272017699055686, + "loss": 0.0785, + "num_input_tokens_seen": 16340432, + "step": 18065 + }, + { + "epoch": 4.769169856143592, + "grad_norm": 0.003030703403055668, + "learning_rate": 0.17266195001132542, + "loss": 0.0939, + "num_input_tokens_seen": 16345008, + "step": 18070 + }, + { + "epoch": 4.77048963969909, + "grad_norm": 0.001868229592218995, + "learning_rate": 0.17260371953733647, + "loss": 0.0827, + "num_input_tokens_seen": 16349232, + "step": 18075 + }, + { + "epoch": 4.771809423254586, + "grad_norm": 0.0022161570377647877, + "learning_rate": 0.1725454855775699, + "loss": 0.0707, + "num_input_tokens_seen": 16353872, + "step": 18080 + }, + { + "epoch": 4.773129206810083, + "grad_norm": 0.0013076981995254755, + "learning_rate": 0.17248724814100616, + "loss": 0.0597, + "num_input_tokens_seen": 16358096, + "step": 18085 + }, + { + "epoch": 4.77444899036558, + "grad_norm": 0.001361354603432119, + "learning_rate": 0.17242900723662619, + "loss": 0.0868, + "num_input_tokens_seen": 16362320, + "step": 18090 + }, + { + "epoch": 4.775768773921077, + "grad_norm": 0.0015439213020727038, + "learning_rate": 0.1723707628734114, + "loss": 0.0882, + "num_input_tokens_seen": 16366480, + "step": 18095 + }, + { + "epoch": 4.777088557476574, + "grad_norm": 0.0031992108561098576, + "learning_rate": 0.1723125150603438, + "loss": 0.1256, + "num_input_tokens_seen": 16371184, + "step": 18100 + }, + { + "epoch": 4.778408341032071, + "grad_norm": 0.001053566811606288, + "learning_rate": 0.1722542638064061, + "loss": 0.0589, + "num_input_tokens_seen": 16375696, + "step": 18105 + }, + { + "epoch": 4.779728124587567, + "grad_norm": 0.001423432375304401, + "learning_rate": 0.17219600912058117, + "loss": 0.0996, + "num_input_tokens_seen": 16380464, + "step": 18110 + }, + { + "epoch": 4.781047908143065, + "grad_norm": 0.0010167533764615655, + "learning_rate": 0.17213775101185272, + "loss": 0.0539, + "num_input_tokens_seen": 16385040, + "step": 18115 + }, + { + "epoch": 4.782367691698561, + "grad_norm": 0.002246815711259842, + "learning_rate": 0.17207948948920485, + "loss": 0.0779, + "num_input_tokens_seen": 16389488, + "step": 18120 + }, + { + "epoch": 4.783687475254059, + "grad_norm": 0.0017931776819750667, + "learning_rate": 0.17202122456162228, + "loss": 0.0531, + "num_input_tokens_seen": 16394192, + "step": 18125 + }, + { + "epoch": 4.785007258809555, + "grad_norm": 0.002242939081043005, + "learning_rate": 0.17196295623809013, + "loss": 0.0459, + "num_input_tokens_seen": 16398640, + "step": 18130 + }, + { + "epoch": 4.786327042365052, + "grad_norm": 0.0025334625970572233, + "learning_rate": 0.1719046845275941, + "loss": 0.0826, + "num_input_tokens_seen": 16403120, + "step": 18135 + }, + { + "epoch": 4.787646825920549, + "grad_norm": 0.0022691935300827026, + "learning_rate": 0.17184640943912044, + "loss": 0.0891, + "num_input_tokens_seen": 16407760, + "step": 18140 + }, + { + "epoch": 4.788966609476046, + "grad_norm": 0.0028382372111082077, + "learning_rate": 0.1717881309816559, + "loss": 0.0754, + "num_input_tokens_seen": 16412336, + "step": 18145 + }, + { + "epoch": 4.7902863930315425, + "grad_norm": 0.003000708995386958, + "learning_rate": 0.1717298491641878, + "loss": 0.1156, + "num_input_tokens_seen": 16416944, + "step": 18150 + }, + { + "epoch": 4.79160617658704, + "grad_norm": 0.0013697176473215222, + "learning_rate": 0.17167156399570385, + "loss": 0.0786, + "num_input_tokens_seen": 16421328, + "step": 18155 + }, + { + "epoch": 4.7929259601425365, + "grad_norm": 0.0020095198415219784, + "learning_rate": 0.17161327548519242, + "loss": 0.0654, + "num_input_tokens_seen": 16425904, + "step": 18160 + }, + { + "epoch": 4.794245743698033, + "grad_norm": 0.001984406029805541, + "learning_rate": 0.1715549836416423, + "loss": 0.0806, + "num_input_tokens_seen": 16430480, + "step": 18165 + }, + { + "epoch": 4.7955655272535305, + "grad_norm": 0.0026865473482757807, + "learning_rate": 0.17149668847404279, + "loss": 0.0863, + "num_input_tokens_seen": 16434928, + "step": 18170 + }, + { + "epoch": 4.796885310809027, + "grad_norm": 0.0014872385654598475, + "learning_rate": 0.1714383899913838, + "loss": 0.0819, + "num_input_tokens_seen": 16439248, + "step": 18175 + }, + { + "epoch": 4.7982050943645245, + "grad_norm": 0.0018638261826708913, + "learning_rate": 0.17138008820265563, + "loss": 0.0892, + "num_input_tokens_seen": 16443696, + "step": 18180 + }, + { + "epoch": 4.799524877920021, + "grad_norm": 0.0019209478050470352, + "learning_rate": 0.17132178311684917, + "loss": 0.069, + "num_input_tokens_seen": 16448048, + "step": 18185 + }, + { + "epoch": 4.800844661475518, + "grad_norm": 0.0012615957530215383, + "learning_rate": 0.1712634747429559, + "loss": 0.0591, + "num_input_tokens_seen": 16452560, + "step": 18190 + }, + { + "epoch": 4.802164445031015, + "grad_norm": 0.0030704112723469734, + "learning_rate": 0.17120516308996753, + "loss": 0.1072, + "num_input_tokens_seen": 16457072, + "step": 18195 + }, + { + "epoch": 4.803484228586512, + "grad_norm": 0.00292596616782248, + "learning_rate": 0.17114684816687653, + "loss": 0.0942, + "num_input_tokens_seen": 16461520, + "step": 18200 + }, + { + "epoch": 4.803484228586512, + "eval_loss": 0.08939731121063232, + "eval_runtime": 75.9395, + "eval_samples_per_second": 88.689, + "eval_steps_per_second": 22.176, + "num_input_tokens_seen": 16461520, + "step": 18200 + }, + { + "epoch": 4.804804012142009, + "grad_norm": 0.0005812290473841131, + "learning_rate": 0.17108852998267585, + "loss": 0.0361, + "num_input_tokens_seen": 16466064, + "step": 18205 + }, + { + "epoch": 4.806123795697506, + "grad_norm": 0.0008868644945323467, + "learning_rate": 0.17103020854635878, + "loss": 0.0717, + "num_input_tokens_seen": 16470640, + "step": 18210 + }, + { + "epoch": 4.807443579253002, + "grad_norm": 0.0005160425207577646, + "learning_rate": 0.1709718838669193, + "loss": 0.0652, + "num_input_tokens_seen": 16475024, + "step": 18215 + }, + { + "epoch": 4.8087633628085, + "grad_norm": 0.002514322753995657, + "learning_rate": 0.17091355595335173, + "loss": 0.092, + "num_input_tokens_seen": 16479760, + "step": 18220 + }, + { + "epoch": 4.810083146363996, + "grad_norm": 0.0015279342187568545, + "learning_rate": 0.17085522481465107, + "loss": 0.0736, + "num_input_tokens_seen": 16484368, + "step": 18225 + }, + { + "epoch": 4.811402929919494, + "grad_norm": 0.002036913065239787, + "learning_rate": 0.17079689045981264, + "loss": 0.0841, + "num_input_tokens_seen": 16488784, + "step": 18230 + }, + { + "epoch": 4.81272271347499, + "grad_norm": 0.00274565233848989, + "learning_rate": 0.17073855289783238, + "loss": 0.0722, + "num_input_tokens_seen": 16492976, + "step": 18235 + }, + { + "epoch": 4.814042497030487, + "grad_norm": 0.0020654555410146713, + "learning_rate": 0.1706802121377066, + "loss": 0.0586, + "num_input_tokens_seen": 16497904, + "step": 18240 + }, + { + "epoch": 4.815362280585984, + "grad_norm": 0.0021568576339632273, + "learning_rate": 0.17062186818843225, + "loss": 0.0981, + "num_input_tokens_seen": 16502480, + "step": 18245 + }, + { + "epoch": 4.816682064141481, + "grad_norm": 0.0026473109610378742, + "learning_rate": 0.17056352105900668, + "loss": 0.0511, + "num_input_tokens_seen": 16507376, + "step": 18250 + }, + { + "epoch": 4.818001847696978, + "grad_norm": 0.0016252051573246717, + "learning_rate": 0.17050517075842772, + "loss": 0.0784, + "num_input_tokens_seen": 16511824, + "step": 18255 + }, + { + "epoch": 4.819321631252475, + "grad_norm": 0.001847064238972962, + "learning_rate": 0.17044681729569375, + "loss": 0.0732, + "num_input_tokens_seen": 16516176, + "step": 18260 + }, + { + "epoch": 4.820641414807971, + "grad_norm": 0.0027341996319592, + "learning_rate": 0.17038846067980365, + "loss": 0.0596, + "num_input_tokens_seen": 16520624, + "step": 18265 + }, + { + "epoch": 4.821961198363469, + "grad_norm": 0.001983017660677433, + "learning_rate": 0.17033010091975664, + "loss": 0.0573, + "num_input_tokens_seen": 16525360, + "step": 18270 + }, + { + "epoch": 4.823280981918965, + "grad_norm": 0.0008281688787974417, + "learning_rate": 0.17027173802455262, + "loss": 0.065, + "num_input_tokens_seen": 16529712, + "step": 18275 + }, + { + "epoch": 4.824600765474462, + "grad_norm": 0.002140807919204235, + "learning_rate": 0.1702133720031918, + "loss": 0.0928, + "num_input_tokens_seen": 16534000, + "step": 18280 + }, + { + "epoch": 4.825920549029959, + "grad_norm": 0.0024675633758306503, + "learning_rate": 0.17015500286467503, + "loss": 0.0583, + "num_input_tokens_seen": 16538768, + "step": 18285 + }, + { + "epoch": 4.827240332585456, + "grad_norm": 0.0014266219222918153, + "learning_rate": 0.17009663061800354, + "loss": 0.0427, + "num_input_tokens_seen": 16543344, + "step": 18290 + }, + { + "epoch": 4.828560116140953, + "grad_norm": 0.002264271257445216, + "learning_rate": 0.17003825527217903, + "loss": 0.0758, + "num_input_tokens_seen": 16547664, + "step": 18295 + }, + { + "epoch": 4.82987989969645, + "grad_norm": 0.002378166187554598, + "learning_rate": 0.16997987683620377, + "loss": 0.1106, + "num_input_tokens_seen": 16552016, + "step": 18300 + }, + { + "epoch": 4.831199683251946, + "grad_norm": 0.0014339271001517773, + "learning_rate": 0.16992149531908043, + "loss": 0.0809, + "num_input_tokens_seen": 16556592, + "step": 18305 + }, + { + "epoch": 4.832519466807444, + "grad_norm": 0.002125783357769251, + "learning_rate": 0.16986311072981214, + "loss": 0.0816, + "num_input_tokens_seen": 16561360, + "step": 18310 + }, + { + "epoch": 4.83383925036294, + "grad_norm": 0.000992152956314385, + "learning_rate": 0.16980472307740255, + "loss": 0.0535, + "num_input_tokens_seen": 16565712, + "step": 18315 + }, + { + "epoch": 4.835159033918437, + "grad_norm": 0.000661825411953032, + "learning_rate": 0.1697463323708558, + "loss": 0.0597, + "num_input_tokens_seen": 16570416, + "step": 18320 + }, + { + "epoch": 4.836478817473934, + "grad_norm": 0.001035422901622951, + "learning_rate": 0.16968793861917641, + "loss": 0.0968, + "num_input_tokens_seen": 16575216, + "step": 18325 + }, + { + "epoch": 4.837798601029431, + "grad_norm": 0.0020020294468849897, + "learning_rate": 0.16962954183136952, + "loss": 0.0685, + "num_input_tokens_seen": 16579856, + "step": 18330 + }, + { + "epoch": 4.839118384584928, + "grad_norm": 0.0029174492228776217, + "learning_rate": 0.16957114201644058, + "loss": 0.0545, + "num_input_tokens_seen": 16584048, + "step": 18335 + }, + { + "epoch": 4.840438168140425, + "grad_norm": 0.0006516139837913215, + "learning_rate": 0.16951273918339563, + "loss": 0.0519, + "num_input_tokens_seen": 16588688, + "step": 18340 + }, + { + "epoch": 4.8417579516959215, + "grad_norm": 0.0036374840419739485, + "learning_rate": 0.16945433334124105, + "loss": 0.0875, + "num_input_tokens_seen": 16593168, + "step": 18345 + }, + { + "epoch": 4.843077735251419, + "grad_norm": 0.0026246006600558758, + "learning_rate": 0.1693959244989838, + "loss": 0.0739, + "num_input_tokens_seen": 16597552, + "step": 18350 + }, + { + "epoch": 4.8443975188069155, + "grad_norm": 0.0017093103379011154, + "learning_rate": 0.16933751266563127, + "loss": 0.0717, + "num_input_tokens_seen": 16602064, + "step": 18355 + }, + { + "epoch": 4.845717302362413, + "grad_norm": 0.0026187642943114042, + "learning_rate": 0.16927909785019118, + "loss": 0.0952, + "num_input_tokens_seen": 16606416, + "step": 18360 + }, + { + "epoch": 4.8470370859179095, + "grad_norm": 0.0045749288983643055, + "learning_rate": 0.169220680061672, + "loss": 0.1273, + "num_input_tokens_seen": 16611024, + "step": 18365 + }, + { + "epoch": 4.848356869473406, + "grad_norm": 0.001909128506667912, + "learning_rate": 0.16916225930908244, + "loss": 0.0989, + "num_input_tokens_seen": 16615504, + "step": 18370 + }, + { + "epoch": 4.8496766530289035, + "grad_norm": 0.0014898855006322265, + "learning_rate": 0.16910383560143163, + "loss": 0.1028, + "num_input_tokens_seen": 16620112, + "step": 18375 + }, + { + "epoch": 4.8509964365844, + "grad_norm": 0.0021545407362282276, + "learning_rate": 0.16904540894772935, + "loss": 0.0413, + "num_input_tokens_seen": 16624464, + "step": 18380 + }, + { + "epoch": 4.8523162201398975, + "grad_norm": 0.0016114371828734875, + "learning_rate": 0.16898697935698562, + "loss": 0.0765, + "num_input_tokens_seen": 16629072, + "step": 18385 + }, + { + "epoch": 4.853636003695394, + "grad_norm": 0.001033620210364461, + "learning_rate": 0.1689285468382111, + "loss": 0.0971, + "num_input_tokens_seen": 16633488, + "step": 18390 + }, + { + "epoch": 4.854955787250891, + "grad_norm": 0.0017642303137108684, + "learning_rate": 0.16887011140041677, + "loss": 0.0624, + "num_input_tokens_seen": 16638064, + "step": 18395 + }, + { + "epoch": 4.856275570806388, + "grad_norm": 0.0016044306103140116, + "learning_rate": 0.1688116730526141, + "loss": 0.0679, + "num_input_tokens_seen": 16642640, + "step": 18400 + }, + { + "epoch": 4.856275570806388, + "eval_loss": 0.08945658802986145, + "eval_runtime": 76.0655, + "eval_samples_per_second": 88.542, + "eval_steps_per_second": 22.139, + "num_input_tokens_seen": 16642640, + "step": 18400 + }, + { + "epoch": 4.857595354361885, + "grad_norm": 0.0006080169696360826, + "learning_rate": 0.1687532318038151, + "loss": 0.0516, + "num_input_tokens_seen": 16647088, + "step": 18405 + }, + { + "epoch": 4.858915137917381, + "grad_norm": 0.0012035404797643423, + "learning_rate": 0.16869478766303206, + "loss": 0.082, + "num_input_tokens_seen": 16651888, + "step": 18410 + }, + { + "epoch": 4.860234921472879, + "grad_norm": 0.0011304658837616444, + "learning_rate": 0.16863634063927788, + "loss": 0.0504, + "num_input_tokens_seen": 16656368, + "step": 18415 + }, + { + "epoch": 4.861554705028375, + "grad_norm": 0.0014874952612444758, + "learning_rate": 0.16857789074156568, + "loss": 0.0833, + "num_input_tokens_seen": 16660784, + "step": 18420 + }, + { + "epoch": 4.862874488583873, + "grad_norm": 0.0012673420133069158, + "learning_rate": 0.16851943797890928, + "loss": 0.0451, + "num_input_tokens_seen": 16665328, + "step": 18425 + }, + { + "epoch": 4.864194272139369, + "grad_norm": 0.0018390222685411572, + "learning_rate": 0.16846098236032284, + "loss": 0.0596, + "num_input_tokens_seen": 16669744, + "step": 18430 + }, + { + "epoch": 4.865514055694866, + "grad_norm": 0.002289541531354189, + "learning_rate": 0.16840252389482097, + "loss": 0.0924, + "num_input_tokens_seen": 16674352, + "step": 18435 + }, + { + "epoch": 4.866833839250363, + "grad_norm": 0.0010582134127616882, + "learning_rate": 0.16834406259141857, + "loss": 0.0791, + "num_input_tokens_seen": 16678960, + "step": 18440 + }, + { + "epoch": 4.86815362280586, + "grad_norm": 0.002240949310362339, + "learning_rate": 0.16828559845913124, + "loss": 0.0856, + "num_input_tokens_seen": 16683440, + "step": 18445 + }, + { + "epoch": 4.869473406361356, + "grad_norm": 0.0015773698687553406, + "learning_rate": 0.16822713150697488, + "loss": 0.0805, + "num_input_tokens_seen": 16687952, + "step": 18450 + }, + { + "epoch": 4.870793189916854, + "grad_norm": 0.0020982457790523767, + "learning_rate": 0.16816866174396575, + "loss": 0.0441, + "num_input_tokens_seen": 16692752, + "step": 18455 + }, + { + "epoch": 4.87211297347235, + "grad_norm": 0.0019035543082281947, + "learning_rate": 0.16811018917912057, + "loss": 0.0881, + "num_input_tokens_seen": 16697264, + "step": 18460 + }, + { + "epoch": 4.873432757027848, + "grad_norm": 0.0009944867342710495, + "learning_rate": 0.16805171382145673, + "loss": 0.0501, + "num_input_tokens_seen": 16701776, + "step": 18465 + }, + { + "epoch": 4.874752540583344, + "grad_norm": 0.0004164550337009132, + "learning_rate": 0.16799323567999175, + "loss": 0.0402, + "num_input_tokens_seen": 16706480, + "step": 18470 + }, + { + "epoch": 4.876072324138841, + "grad_norm": 0.002368886023759842, + "learning_rate": 0.16793475476374367, + "loss": 0.0607, + "num_input_tokens_seen": 16711024, + "step": 18475 + }, + { + "epoch": 4.877392107694338, + "grad_norm": 0.001040761941112578, + "learning_rate": 0.1678762710817311, + "loss": 0.0778, + "num_input_tokens_seen": 16715504, + "step": 18480 + }, + { + "epoch": 4.878711891249835, + "grad_norm": 0.0018333066254854202, + "learning_rate": 0.1678177846429728, + "loss": 0.0825, + "num_input_tokens_seen": 16720048, + "step": 18485 + }, + { + "epoch": 4.880031674805332, + "grad_norm": 0.002893364056944847, + "learning_rate": 0.16775929545648827, + "loss": 0.1046, + "num_input_tokens_seen": 16724656, + "step": 18490 + }, + { + "epoch": 4.881351458360829, + "grad_norm": 0.0010446831583976746, + "learning_rate": 0.16770080353129715, + "loss": 0.0573, + "num_input_tokens_seen": 16729008, + "step": 18495 + }, + { + "epoch": 4.882671241916325, + "grad_norm": 0.002126034814864397, + "learning_rate": 0.16764230887641968, + "loss": 0.0743, + "num_input_tokens_seen": 16733776, + "step": 18500 + }, + { + "epoch": 4.883991025471823, + "grad_norm": 0.000450758496299386, + "learning_rate": 0.1675838115008765, + "loss": 0.0814, + "num_input_tokens_seen": 16738064, + "step": 18505 + }, + { + "epoch": 4.885310809027319, + "grad_norm": 0.0007980403024703264, + "learning_rate": 0.1675253114136886, + "loss": 0.0663, + "num_input_tokens_seen": 16742800, + "step": 18510 + }, + { + "epoch": 4.886630592582817, + "grad_norm": 0.002070349408313632, + "learning_rate": 0.16746680862387747, + "loss": 0.0546, + "num_input_tokens_seen": 16747312, + "step": 18515 + }, + { + "epoch": 4.887950376138313, + "grad_norm": 0.001265111262910068, + "learning_rate": 0.16740830314046493, + "loss": 0.0581, + "num_input_tokens_seen": 16751888, + "step": 18520 + }, + { + "epoch": 4.88927015969381, + "grad_norm": 0.0020758481696248055, + "learning_rate": 0.1673497949724733, + "loss": 0.0564, + "num_input_tokens_seen": 16756400, + "step": 18525 + }, + { + "epoch": 4.890589943249307, + "grad_norm": 0.00422595115378499, + "learning_rate": 0.16729128412892522, + "loss": 0.08, + "num_input_tokens_seen": 16760688, + "step": 18530 + }, + { + "epoch": 4.891909726804804, + "grad_norm": 0.0012035064864903688, + "learning_rate": 0.16723277061884384, + "loss": 0.0796, + "num_input_tokens_seen": 16765168, + "step": 18535 + }, + { + "epoch": 4.893229510360301, + "grad_norm": 0.002009538235142827, + "learning_rate": 0.16717425445125267, + "loss": 0.0752, + "num_input_tokens_seen": 16769904, + "step": 18540 + }, + { + "epoch": 4.894549293915798, + "grad_norm": 0.0032379906624555588, + "learning_rate": 0.16711573563517565, + "loss": 0.097, + "num_input_tokens_seen": 16774576, + "step": 18545 + }, + { + "epoch": 4.8958690774712945, + "grad_norm": 0.0021614613942801952, + "learning_rate": 0.1670572141796371, + "loss": 0.0678, + "num_input_tokens_seen": 16779152, + "step": 18550 + }, + { + "epoch": 4.897188861026792, + "grad_norm": 0.003629072569310665, + "learning_rate": 0.16699869009366175, + "loss": 0.1095, + "num_input_tokens_seen": 16783920, + "step": 18555 + }, + { + "epoch": 4.8985086445822885, + "grad_norm": 0.004073075484484434, + "learning_rate": 0.1669401633862748, + "loss": 0.1192, + "num_input_tokens_seen": 16788464, + "step": 18560 + }, + { + "epoch": 4.899828428137785, + "grad_norm": 0.0016803054604679346, + "learning_rate": 0.16688163406650178, + "loss": 0.0839, + "num_input_tokens_seen": 16793008, + "step": 18565 + }, + { + "epoch": 4.9011482116932825, + "grad_norm": 0.0015713756438344717, + "learning_rate": 0.1668231021433686, + "loss": 0.0775, + "num_input_tokens_seen": 16798064, + "step": 18570 + }, + { + "epoch": 4.902467995248779, + "grad_norm": 0.001667473348788917, + "learning_rate": 0.1667645676259017, + "loss": 0.0738, + "num_input_tokens_seen": 16802576, + "step": 18575 + }, + { + "epoch": 4.903787778804276, + "grad_norm": 0.0024565772619098425, + "learning_rate": 0.1667060305231277, + "loss": 0.0778, + "num_input_tokens_seen": 16807216, + "step": 18580 + }, + { + "epoch": 4.905107562359773, + "grad_norm": 0.0021207251120358706, + "learning_rate": 0.16664749084407396, + "loss": 0.0764, + "num_input_tokens_seen": 16811856, + "step": 18585 + }, + { + "epoch": 4.90642734591527, + "grad_norm": 0.00256923446431756, + "learning_rate": 0.16658894859776788, + "loss": 0.0817, + "num_input_tokens_seen": 16815952, + "step": 18590 + }, + { + "epoch": 4.907747129470767, + "grad_norm": 0.0012241494841873646, + "learning_rate": 0.16653040379323752, + "loss": 0.0989, + "num_input_tokens_seen": 16820464, + "step": 18595 + }, + { + "epoch": 4.909066913026264, + "grad_norm": 0.002088244305923581, + "learning_rate": 0.16647185643951107, + "loss": 0.0635, + "num_input_tokens_seen": 16825040, + "step": 18600 + }, + { + "epoch": 4.909066913026264, + "eval_loss": 0.09111309051513672, + "eval_runtime": 75.8615, + "eval_samples_per_second": 88.78, + "eval_steps_per_second": 22.198, + "num_input_tokens_seen": 16825040, + "step": 18600 + }, + { + "epoch": 4.91038669658176, + "grad_norm": 0.0017154806992039084, + "learning_rate": 0.1664133065456174, + "loss": 0.0566, + "num_input_tokens_seen": 16829584, + "step": 18605 + }, + { + "epoch": 4.911706480137258, + "grad_norm": 0.0016588176367804408, + "learning_rate": 0.1663547541205856, + "loss": 0.0799, + "num_input_tokens_seen": 16833968, + "step": 18610 + }, + { + "epoch": 4.913026263692754, + "grad_norm": 0.0019799056462943554, + "learning_rate": 0.16629619917344518, + "loss": 0.0743, + "num_input_tokens_seen": 16838736, + "step": 18615 + }, + { + "epoch": 4.914346047248252, + "grad_norm": 0.0016209085006266832, + "learning_rate": 0.16623764171322605, + "loss": 0.1007, + "num_input_tokens_seen": 16842992, + "step": 18620 + }, + { + "epoch": 4.915665830803748, + "grad_norm": 0.0018839322729036212, + "learning_rate": 0.1661790817489585, + "loss": 0.0825, + "num_input_tokens_seen": 16847632, + "step": 18625 + }, + { + "epoch": 4.916985614359245, + "grad_norm": 0.002594192046672106, + "learning_rate": 0.16612051928967328, + "loss": 0.0613, + "num_input_tokens_seen": 16852080, + "step": 18630 + }, + { + "epoch": 4.918305397914742, + "grad_norm": 0.0023548665922135115, + "learning_rate": 0.16606195434440138, + "loss": 0.0809, + "num_input_tokens_seen": 16856816, + "step": 18635 + }, + { + "epoch": 4.919625181470239, + "grad_norm": 0.0010135542834177613, + "learning_rate": 0.16600338692217426, + "loss": 0.0542, + "num_input_tokens_seen": 16861296, + "step": 18640 + }, + { + "epoch": 4.920944965025736, + "grad_norm": 0.0011221119202673435, + "learning_rate": 0.16594481703202374, + "loss": 0.0954, + "num_input_tokens_seen": 16865840, + "step": 18645 + }, + { + "epoch": 4.922264748581233, + "grad_norm": 0.0017228183569386601, + "learning_rate": 0.1658862446829821, + "loss": 0.0869, + "num_input_tokens_seen": 16870512, + "step": 18650 + }, + { + "epoch": 4.923584532136729, + "grad_norm": 0.001929003745317459, + "learning_rate": 0.16582766988408187, + "loss": 0.0655, + "num_input_tokens_seen": 16874768, + "step": 18655 + }, + { + "epoch": 4.924904315692227, + "grad_norm": 0.0020400346256792545, + "learning_rate": 0.16576909264435608, + "loss": 0.0933, + "num_input_tokens_seen": 16879440, + "step": 18660 + }, + { + "epoch": 4.926224099247723, + "grad_norm": 0.0016741999424993992, + "learning_rate": 0.16571051297283798, + "loss": 0.0492, + "num_input_tokens_seen": 16883824, + "step": 18665 + }, + { + "epoch": 4.927543882803221, + "grad_norm": 0.0010855571599677205, + "learning_rate": 0.16565193087856137, + "loss": 0.048, + "num_input_tokens_seen": 16887984, + "step": 18670 + }, + { + "epoch": 4.928863666358717, + "grad_norm": 0.0026918435469269753, + "learning_rate": 0.16559334637056033, + "loss": 0.0869, + "num_input_tokens_seen": 16892400, + "step": 18675 + }, + { + "epoch": 4.930183449914214, + "grad_norm": 0.0005801782244816422, + "learning_rate": 0.16553475945786933, + "loss": 0.0446, + "num_input_tokens_seen": 16896976, + "step": 18680 + }, + { + "epoch": 4.931503233469711, + "grad_norm": 0.0006958819576539099, + "learning_rate": 0.16547617014952318, + "loss": 0.0551, + "num_input_tokens_seen": 16901488, + "step": 18685 + }, + { + "epoch": 4.932823017025208, + "grad_norm": 0.0018438731785863638, + "learning_rate": 0.1654175784545571, + "loss": 0.0769, + "num_input_tokens_seen": 16906160, + "step": 18690 + }, + { + "epoch": 4.934142800580704, + "grad_norm": 0.0007989428122527897, + "learning_rate": 0.1653589843820067, + "loss": 0.0816, + "num_input_tokens_seen": 16910544, + "step": 18695 + }, + { + "epoch": 4.935462584136202, + "grad_norm": 0.0019339725840836763, + "learning_rate": 0.1653003879409079, + "loss": 0.0856, + "num_input_tokens_seen": 16914960, + "step": 18700 + }, + { + "epoch": 4.936782367691698, + "grad_norm": 0.0017138724215328693, + "learning_rate": 0.165241789140297, + "loss": 0.0866, + "num_input_tokens_seen": 16919504, + "step": 18705 + }, + { + "epoch": 4.938102151247195, + "grad_norm": 0.0016510967398062348, + "learning_rate": 0.16518318798921064, + "loss": 0.0993, + "num_input_tokens_seen": 16924016, + "step": 18710 + }, + { + "epoch": 4.939421934802692, + "grad_norm": 0.0010717567056417465, + "learning_rate": 0.16512458449668593, + "loss": 0.0817, + "num_input_tokens_seen": 16928752, + "step": 18715 + }, + { + "epoch": 4.940741718358189, + "grad_norm": 0.0012019937857985497, + "learning_rate": 0.1650659786717602, + "loss": 0.0688, + "num_input_tokens_seen": 16933424, + "step": 18720 + }, + { + "epoch": 4.942061501913686, + "grad_norm": 0.0012800591066479683, + "learning_rate": 0.1650073705234712, + "loss": 0.0937, + "num_input_tokens_seen": 16937936, + "step": 18725 + }, + { + "epoch": 4.943381285469183, + "grad_norm": 0.0003689168661367148, + "learning_rate": 0.16494876006085712, + "loss": 0.0743, + "num_input_tokens_seen": 16942832, + "step": 18730 + }, + { + "epoch": 4.9447010690246795, + "grad_norm": 0.0008487838786095381, + "learning_rate": 0.16489014729295634, + "loss": 0.0761, + "num_input_tokens_seen": 16947024, + "step": 18735 + }, + { + "epoch": 4.946020852580177, + "grad_norm": 0.0016795695992186666, + "learning_rate": 0.16483153222880775, + "loss": 0.102, + "num_input_tokens_seen": 16951760, + "step": 18740 + }, + { + "epoch": 4.9473406361356735, + "grad_norm": 0.003060796530917287, + "learning_rate": 0.16477291487745052, + "loss": 0.0531, + "num_input_tokens_seen": 16956528, + "step": 18745 + }, + { + "epoch": 4.948660419691171, + "grad_norm": 0.002079947618767619, + "learning_rate": 0.16471429524792416, + "loss": 0.0818, + "num_input_tokens_seen": 16961008, + "step": 18750 + }, + { + "epoch": 4.9499802032466675, + "grad_norm": 0.0008528900216333568, + "learning_rate": 0.16465567334926856, + "loss": 0.1246, + "num_input_tokens_seen": 16965648, + "step": 18755 + }, + { + "epoch": 4.951299986802164, + "grad_norm": 0.00034783402225002646, + "learning_rate": 0.16459704919052395, + "loss": 0.0478, + "num_input_tokens_seen": 16970032, + "step": 18760 + }, + { + "epoch": 4.9526197703576615, + "grad_norm": 0.001621920382604003, + "learning_rate": 0.16453842278073086, + "loss": 0.0863, + "num_input_tokens_seen": 16974640, + "step": 18765 + }, + { + "epoch": 4.953939553913158, + "grad_norm": 0.0031492807902395725, + "learning_rate": 0.16447979412893038, + "loss": 0.0739, + "num_input_tokens_seen": 16979120, + "step": 18770 + }, + { + "epoch": 4.9552593374686555, + "grad_norm": 0.0015792461344972253, + "learning_rate": 0.16442116324416367, + "loss": 0.0693, + "num_input_tokens_seen": 16983696, + "step": 18775 + }, + { + "epoch": 4.956579121024152, + "grad_norm": 0.0003894693509209901, + "learning_rate": 0.1643625301354723, + "loss": 0.0819, + "num_input_tokens_seen": 16988368, + "step": 18780 + }, + { + "epoch": 4.957898904579649, + "grad_norm": 0.0031089535914361477, + "learning_rate": 0.16430389481189828, + "loss": 0.0962, + "num_input_tokens_seen": 16992816, + "step": 18785 + }, + { + "epoch": 4.959218688135146, + "grad_norm": 0.002190509345382452, + "learning_rate": 0.164245257282484, + "loss": 0.0579, + "num_input_tokens_seen": 16997232, + "step": 18790 + }, + { + "epoch": 4.960538471690643, + "grad_norm": 0.002732687396928668, + "learning_rate": 0.16418661755627195, + "loss": 0.0986, + "num_input_tokens_seen": 17001744, + "step": 18795 + }, + { + "epoch": 4.96185825524614, + "grad_norm": 0.0019160310039296746, + "learning_rate": 0.16412797564230527, + "loss": 0.0637, + "num_input_tokens_seen": 17006160, + "step": 18800 + }, + { + "epoch": 4.96185825524614, + "eval_loss": 0.091304250061512, + "eval_runtime": 75.8518, + "eval_samples_per_second": 88.792, + "eval_steps_per_second": 22.201, + "num_input_tokens_seen": 17006160, + "step": 18800 + }, + { + "epoch": 4.963178038801637, + "grad_norm": 0.002278333529829979, + "learning_rate": 0.16406933154962713, + "loss": 0.0604, + "num_input_tokens_seen": 17010576, + "step": 18805 + }, + { + "epoch": 4.964497822357133, + "grad_norm": 0.002374243689700961, + "learning_rate": 0.16401068528728133, + "loss": 0.077, + "num_input_tokens_seen": 17015376, + "step": 18810 + }, + { + "epoch": 4.965817605912631, + "grad_norm": 0.00047488673590123653, + "learning_rate": 0.16395203686431173, + "loss": 0.0651, + "num_input_tokens_seen": 17020144, + "step": 18815 + }, + { + "epoch": 4.967137389468127, + "grad_norm": 0.0013168303994461894, + "learning_rate": 0.16389338628976277, + "loss": 0.0585, + "num_input_tokens_seen": 17024752, + "step": 18820 + }, + { + "epoch": 4.968457173023624, + "grad_norm": 0.0020907046273350716, + "learning_rate": 0.163834733572679, + "loss": 0.0705, + "num_input_tokens_seen": 17029392, + "step": 18825 + }, + { + "epoch": 4.969776956579121, + "grad_norm": 0.002623140113428235, + "learning_rate": 0.16377607872210545, + "loss": 0.098, + "num_input_tokens_seen": 17034032, + "step": 18830 + }, + { + "epoch": 4.971096740134618, + "grad_norm": 0.002363632433116436, + "learning_rate": 0.16371742174708748, + "loss": 0.048, + "num_input_tokens_seen": 17038576, + "step": 18835 + }, + { + "epoch": 4.972416523690114, + "grad_norm": 0.002178989350795746, + "learning_rate": 0.16365876265667065, + "loss": 0.0733, + "num_input_tokens_seen": 17042992, + "step": 18840 + }, + { + "epoch": 4.973736307245612, + "grad_norm": 0.0028187043499201536, + "learning_rate": 0.163600101459901, + "loss": 0.0948, + "num_input_tokens_seen": 17047376, + "step": 18845 + }, + { + "epoch": 4.975056090801108, + "grad_norm": 0.002221873728558421, + "learning_rate": 0.16354143816582484, + "loss": 0.0671, + "num_input_tokens_seen": 17051984, + "step": 18850 + }, + { + "epoch": 4.976375874356606, + "grad_norm": 0.0022822576574981213, + "learning_rate": 0.1634827727834887, + "loss": 0.082, + "num_input_tokens_seen": 17056528, + "step": 18855 + }, + { + "epoch": 4.977695657912102, + "grad_norm": 0.0013474369188770652, + "learning_rate": 0.16342410532193954, + "loss": 0.0559, + "num_input_tokens_seen": 17060848, + "step": 18860 + }, + { + "epoch": 4.979015441467599, + "grad_norm": 0.0023635532706975937, + "learning_rate": 0.16336543579022464, + "loss": 0.084, + "num_input_tokens_seen": 17065456, + "step": 18865 + }, + { + "epoch": 4.980335225023096, + "grad_norm": 0.0012072414392605424, + "learning_rate": 0.16330676419739157, + "loss": 0.0669, + "num_input_tokens_seen": 17069936, + "step": 18870 + }, + { + "epoch": 4.981655008578593, + "grad_norm": 0.0033006074372678995, + "learning_rate": 0.1632480905524883, + "loss": 0.0545, + "num_input_tokens_seen": 17074224, + "step": 18875 + }, + { + "epoch": 4.98297479213409, + "grad_norm": 0.001198813202790916, + "learning_rate": 0.16318941486456293, + "loss": 0.0528, + "num_input_tokens_seen": 17078800, + "step": 18880 + }, + { + "epoch": 4.984294575689587, + "grad_norm": 0.0032831791322678328, + "learning_rate": 0.16313073714266405, + "loss": 0.1166, + "num_input_tokens_seen": 17083472, + "step": 18885 + }, + { + "epoch": 4.9856143592450834, + "grad_norm": 0.0027082161977887154, + "learning_rate": 0.16307205739584052, + "loss": 0.0767, + "num_input_tokens_seen": 17087792, + "step": 18890 + }, + { + "epoch": 4.986934142800581, + "grad_norm": 0.0016435289289802313, + "learning_rate": 0.16301337563314144, + "loss": 0.0613, + "num_input_tokens_seen": 17092240, + "step": 18895 + }, + { + "epoch": 4.9882539263560775, + "grad_norm": 0.003147495212033391, + "learning_rate": 0.1629546918636163, + "loss": 0.0831, + "num_input_tokens_seen": 17096656, + "step": 18900 + }, + { + "epoch": 4.989573709911575, + "grad_norm": 0.0008915839134715497, + "learning_rate": 0.16289600609631485, + "loss": 0.0896, + "num_input_tokens_seen": 17101360, + "step": 18905 + }, + { + "epoch": 4.9908934934670715, + "grad_norm": 0.0015776327345520258, + "learning_rate": 0.16283731834028722, + "loss": 0.0782, + "num_input_tokens_seen": 17106032, + "step": 18910 + }, + { + "epoch": 4.992213277022568, + "grad_norm": 0.0036008867900818586, + "learning_rate": 0.16277862860458378, + "loss": 0.1044, + "num_input_tokens_seen": 17110480, + "step": 18915 + }, + { + "epoch": 4.9935330605780655, + "grad_norm": 0.002899588318541646, + "learning_rate": 0.16271993689825526, + "loss": 0.0967, + "num_input_tokens_seen": 17115248, + "step": 18920 + }, + { + "epoch": 4.994852844133562, + "grad_norm": 0.0012702655512839556, + "learning_rate": 0.1626612432303526, + "loss": 0.0777, + "num_input_tokens_seen": 17120112, + "step": 18925 + }, + { + "epoch": 4.9961726276890595, + "grad_norm": 0.0030111493542790413, + "learning_rate": 0.1626025476099271, + "loss": 0.1243, + "num_input_tokens_seen": 17124464, + "step": 18930 + }, + { + "epoch": 4.997492411244556, + "grad_norm": 0.0024840112309902906, + "learning_rate": 0.1625438500460304, + "loss": 0.0786, + "num_input_tokens_seen": 17128816, + "step": 18935 + }, + { + "epoch": 4.998812194800053, + "grad_norm": 0.001838306663557887, + "learning_rate": 0.16248515054771442, + "loss": 0.0952, + "num_input_tokens_seen": 17133296, + "step": 18940 + }, + { + "epoch": 5.0, + "grad_norm": 0.005599270574748516, + "learning_rate": 0.16242644912403123, + "loss": 0.0735, + "num_input_tokens_seen": 17137264, + "step": 18945 + }, + { + "epoch": 5.001319783555497, + "grad_norm": 0.0019872912671417, + "learning_rate": 0.1623677457840335, + "loss": 0.0608, + "num_input_tokens_seen": 17141904, + "step": 18950 + }, + { + "epoch": 5.002639567110994, + "grad_norm": 0.0015391549095511436, + "learning_rate": 0.16230904053677397, + "loss": 0.0446, + "num_input_tokens_seen": 17146512, + "step": 18955 + }, + { + "epoch": 5.003959350666491, + "grad_norm": 0.0005968199111521244, + "learning_rate": 0.16225033339130568, + "loss": 0.0493, + "num_input_tokens_seen": 17151056, + "step": 18960 + }, + { + "epoch": 5.005279134221988, + "grad_norm": 0.002786041237413883, + "learning_rate": 0.16219162435668197, + "loss": 0.0626, + "num_input_tokens_seen": 17155888, + "step": 18965 + }, + { + "epoch": 5.006598917777485, + "grad_norm": 0.002705338643863797, + "learning_rate": 0.16213291344195666, + "loss": 0.0713, + "num_input_tokens_seen": 17160336, + "step": 18970 + }, + { + "epoch": 5.007918701332981, + "grad_norm": 0.001890918705612421, + "learning_rate": 0.16207420065618358, + "loss": 0.07, + "num_input_tokens_seen": 17164976, + "step": 18975 + }, + { + "epoch": 5.009238484888479, + "grad_norm": 0.0011653256369754672, + "learning_rate": 0.16201548600841706, + "loss": 0.0482, + "num_input_tokens_seen": 17169392, + "step": 18980 + }, + { + "epoch": 5.010558268443975, + "grad_norm": 0.000705124344676733, + "learning_rate": 0.16195676950771154, + "loss": 0.0655, + "num_input_tokens_seen": 17174000, + "step": 18985 + }, + { + "epoch": 5.011878051999472, + "grad_norm": 0.0013669951586052775, + "learning_rate": 0.16189805116312198, + "loss": 0.0686, + "num_input_tokens_seen": 17178800, + "step": 18990 + }, + { + "epoch": 5.013197835554969, + "grad_norm": 0.0016432000556960702, + "learning_rate": 0.16183933098370337, + "loss": 0.0644, + "num_input_tokens_seen": 17183504, + "step": 18995 + }, + { + "epoch": 5.014517619110466, + "grad_norm": 0.0015480871079489589, + "learning_rate": 0.16178060897851115, + "loss": 0.066, + "num_input_tokens_seen": 17188336, + "step": 19000 + }, + { + "epoch": 5.014517619110466, + "eval_loss": 0.09089416265487671, + "eval_runtime": 75.8239, + "eval_samples_per_second": 88.824, + "eval_steps_per_second": 22.209, + "num_input_tokens_seen": 17188336, + "step": 19000 + }, + { + "epoch": 5.015837402665963, + "grad_norm": 0.002611062489449978, + "learning_rate": 0.16172188515660096, + "loss": 0.0652, + "num_input_tokens_seen": 17192688, + "step": 19005 + }, + { + "epoch": 5.01715718622146, + "grad_norm": 0.0028493436984717846, + "learning_rate": 0.16166315952702878, + "loss": 0.0719, + "num_input_tokens_seen": 17197328, + "step": 19010 + }, + { + "epoch": 5.018476969776956, + "grad_norm": 0.0018225215608254075, + "learning_rate": 0.16160443209885084, + "loss": 0.0831, + "num_input_tokens_seen": 17201840, + "step": 19015 + }, + { + "epoch": 5.019796753332454, + "grad_norm": 0.002437033224850893, + "learning_rate": 0.16154570288112363, + "loss": 0.0657, + "num_input_tokens_seen": 17206576, + "step": 19020 + }, + { + "epoch": 5.02111653688795, + "grad_norm": 0.000758218637201935, + "learning_rate": 0.16148697188290395, + "loss": 0.0435, + "num_input_tokens_seen": 17210960, + "step": 19025 + }, + { + "epoch": 5.022436320443448, + "grad_norm": 0.0020901185926049948, + "learning_rate": 0.16142823911324888, + "loss": 0.0487, + "num_input_tokens_seen": 17215664, + "step": 19030 + }, + { + "epoch": 5.023756103998944, + "grad_norm": 0.0030184367205947638, + "learning_rate": 0.16136950458121568, + "loss": 0.0659, + "num_input_tokens_seen": 17220080, + "step": 19035 + }, + { + "epoch": 5.025075887554441, + "grad_norm": 0.0008248041267506778, + "learning_rate": 0.16131076829586205, + "loss": 0.0438, + "num_input_tokens_seen": 17224432, + "step": 19040 + }, + { + "epoch": 5.026395671109938, + "grad_norm": 0.00041158479871228337, + "learning_rate": 0.1612520302662457, + "loss": 0.036, + "num_input_tokens_seen": 17228912, + "step": 19045 + }, + { + "epoch": 5.027715454665435, + "grad_norm": 0.0032532995101064444, + "learning_rate": 0.16119329050142497, + "loss": 0.0707, + "num_input_tokens_seen": 17233392, + "step": 19050 + }, + { + "epoch": 5.029035238220931, + "grad_norm": 0.0006571351550519466, + "learning_rate": 0.16113454901045818, + "loss": 0.0326, + "num_input_tokens_seen": 17238000, + "step": 19055 + }, + { + "epoch": 5.030355021776429, + "grad_norm": 0.002269914373755455, + "learning_rate": 0.16107580580240397, + "loss": 0.0813, + "num_input_tokens_seen": 17242640, + "step": 19060 + }, + { + "epoch": 5.031674805331925, + "grad_norm": 0.0021539260633289814, + "learning_rate": 0.16101706088632134, + "loss": 0.061, + "num_input_tokens_seen": 17247088, + "step": 19065 + }, + { + "epoch": 5.032994588887423, + "grad_norm": 0.0016086060786619782, + "learning_rate": 0.16095831427126947, + "loss": 0.0765, + "num_input_tokens_seen": 17251600, + "step": 19070 + }, + { + "epoch": 5.034314372442919, + "grad_norm": 0.00040413817623630166, + "learning_rate": 0.16089956596630783, + "loss": 0.0385, + "num_input_tokens_seen": 17256336, + "step": 19075 + }, + { + "epoch": 5.035634155998416, + "grad_norm": 0.0016635028878226876, + "learning_rate": 0.16084081598049618, + "loss": 0.0503, + "num_input_tokens_seen": 17260784, + "step": 19080 + }, + { + "epoch": 5.036953939553913, + "grad_norm": 0.002017208142206073, + "learning_rate": 0.1607820643228944, + "loss": 0.0411, + "num_input_tokens_seen": 17265136, + "step": 19085 + }, + { + "epoch": 5.03827372310941, + "grad_norm": 0.004801803268492222, + "learning_rate": 0.16072331100256285, + "loss": 0.0703, + "num_input_tokens_seen": 17270096, + "step": 19090 + }, + { + "epoch": 5.039593506664907, + "grad_norm": 0.00046437393757514656, + "learning_rate": 0.16066455602856197, + "loss": 0.0716, + "num_input_tokens_seen": 17274768, + "step": 19095 + }, + { + "epoch": 5.040913290220404, + "grad_norm": 0.002064286731183529, + "learning_rate": 0.16060579940995257, + "loss": 0.0407, + "num_input_tokens_seen": 17279248, + "step": 19100 + }, + { + "epoch": 5.0422330737759005, + "grad_norm": 0.0010968627175316215, + "learning_rate": 0.16054704115579557, + "loss": 0.0508, + "num_input_tokens_seen": 17283568, + "step": 19105 + }, + { + "epoch": 5.043552857331398, + "grad_norm": 0.0027624377980828285, + "learning_rate": 0.1604882812751523, + "loss": 0.0791, + "num_input_tokens_seen": 17288016, + "step": 19110 + }, + { + "epoch": 5.0448726408868945, + "grad_norm": 0.0006248831050470471, + "learning_rate": 0.16042951977708425, + "loss": 0.0485, + "num_input_tokens_seen": 17292144, + "step": 19115 + }, + { + "epoch": 5.046192424442391, + "grad_norm": 0.0005336939939297736, + "learning_rate": 0.16037075667065318, + "loss": 0.0696, + "num_input_tokens_seen": 17296848, + "step": 19120 + }, + { + "epoch": 5.0475122079978885, + "grad_norm": 0.0016741595463827252, + "learning_rate": 0.1603119919649211, + "loss": 0.0897, + "num_input_tokens_seen": 17301360, + "step": 19125 + }, + { + "epoch": 5.048831991553385, + "grad_norm": 0.002836592961102724, + "learning_rate": 0.16025322566895028, + "loss": 0.0554, + "num_input_tokens_seen": 17305744, + "step": 19130 + }, + { + "epoch": 5.0501517751088825, + "grad_norm": 0.0017752702115103602, + "learning_rate": 0.16019445779180322, + "loss": 0.0587, + "num_input_tokens_seen": 17310736, + "step": 19135 + }, + { + "epoch": 5.051471558664379, + "grad_norm": 0.0017123221186921, + "learning_rate": 0.16013568834254271, + "loss": 0.0501, + "num_input_tokens_seen": 17315120, + "step": 19140 + }, + { + "epoch": 5.052791342219876, + "grad_norm": 0.002870164578780532, + "learning_rate": 0.1600769173302316, + "loss": 0.0715, + "num_input_tokens_seen": 17319280, + "step": 19145 + }, + { + "epoch": 5.054111125775373, + "grad_norm": 0.0015798399690538645, + "learning_rate": 0.16001814476393322, + "loss": 0.07, + "num_input_tokens_seen": 17324048, + "step": 19150 + }, + { + "epoch": 5.05543090933087, + "grad_norm": 0.00047532006283290684, + "learning_rate": 0.15995937065271104, + "loss": 0.0659, + "num_input_tokens_seen": 17328464, + "step": 19155 + }, + { + "epoch": 5.056750692886367, + "grad_norm": 0.0010050006676465273, + "learning_rate": 0.15990059500562873, + "loss": 0.061, + "num_input_tokens_seen": 17333040, + "step": 19160 + }, + { + "epoch": 5.058070476441864, + "grad_norm": 0.0012765193823724985, + "learning_rate": 0.15984181783175025, + "loss": 0.0557, + "num_input_tokens_seen": 17337744, + "step": 19165 + }, + { + "epoch": 5.05939025999736, + "grad_norm": 0.0028029235545545816, + "learning_rate": 0.1597830391401398, + "loss": 0.0773, + "num_input_tokens_seen": 17342256, + "step": 19170 + }, + { + "epoch": 5.060710043552858, + "grad_norm": 0.004406547173857689, + "learning_rate": 0.15972425893986178, + "loss": 0.0824, + "num_input_tokens_seen": 17346800, + "step": 19175 + }, + { + "epoch": 5.062029827108354, + "grad_norm": 0.0032015934120863676, + "learning_rate": 0.15966547723998084, + "loss": 0.0858, + "num_input_tokens_seen": 17351344, + "step": 19180 + }, + { + "epoch": 5.063349610663851, + "grad_norm": 0.0015180091140791774, + "learning_rate": 0.15960669404956176, + "loss": 0.0577, + "num_input_tokens_seen": 17355920, + "step": 19185 + }, + { + "epoch": 5.064669394219348, + "grad_norm": 0.002898639300838113, + "learning_rate": 0.1595479093776698, + "loss": 0.0717, + "num_input_tokens_seen": 17360208, + "step": 19190 + }, + { + "epoch": 5.065989177774845, + "grad_norm": 0.002402796410024166, + "learning_rate": 0.15948912323337022, + "loss": 0.0567, + "num_input_tokens_seen": 17364528, + "step": 19195 + }, + { + "epoch": 5.067308961330342, + "grad_norm": 0.0009117604931816459, + "learning_rate": 0.1594303356257286, + "loss": 0.0575, + "num_input_tokens_seen": 17369104, + "step": 19200 + }, + { + "epoch": 5.067308961330342, + "eval_loss": 0.09343168884515762, + "eval_runtime": 75.911, + "eval_samples_per_second": 88.722, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 17369104, + "step": 19200 + }, + { + "epoch": 5.068628744885839, + "grad_norm": 0.0018382285488769412, + "learning_rate": 0.15937154656381072, + "loss": 0.0683, + "num_input_tokens_seen": 17373456, + "step": 19205 + }, + { + "epoch": 5.069948528441335, + "grad_norm": 0.000989733962342143, + "learning_rate": 0.15931275605668258, + "loss": 0.0525, + "num_input_tokens_seen": 17378160, + "step": 19210 + }, + { + "epoch": 5.071268311996833, + "grad_norm": 0.002561921253800392, + "learning_rate": 0.1592539641134104, + "loss": 0.0421, + "num_input_tokens_seen": 17382704, + "step": 19215 + }, + { + "epoch": 5.072588095552329, + "grad_norm": 0.0013015762669965625, + "learning_rate": 0.1591951707430607, + "loss": 0.0461, + "num_input_tokens_seen": 17387472, + "step": 19220 + }, + { + "epoch": 5.073907879107827, + "grad_norm": 0.00167604791931808, + "learning_rate": 0.15913637595470007, + "loss": 0.0597, + "num_input_tokens_seen": 17391856, + "step": 19225 + }, + { + "epoch": 5.075227662663323, + "grad_norm": 0.0029449041467159986, + "learning_rate": 0.15907757975739548, + "loss": 0.052, + "num_input_tokens_seen": 17396528, + "step": 19230 + }, + { + "epoch": 5.07654744621882, + "grad_norm": 0.0010556384222581983, + "learning_rate": 0.159018782160214, + "loss": 0.0501, + "num_input_tokens_seen": 17401104, + "step": 19235 + }, + { + "epoch": 5.077867229774317, + "grad_norm": 0.001804729225113988, + "learning_rate": 0.158959983172223, + "loss": 0.0747, + "num_input_tokens_seen": 17405488, + "step": 19240 + }, + { + "epoch": 5.079187013329814, + "grad_norm": 0.00365358660928905, + "learning_rate": 0.15890118280249, + "loss": 0.0723, + "num_input_tokens_seen": 17409968, + "step": 19245 + }, + { + "epoch": 5.08050679688531, + "grad_norm": 0.0024176952429115772, + "learning_rate": 0.15884238106008275, + "loss": 0.0543, + "num_input_tokens_seen": 17414256, + "step": 19250 + }, + { + "epoch": 5.081826580440808, + "grad_norm": 0.004093716852366924, + "learning_rate": 0.15878357795406922, + "loss": 0.0661, + "num_input_tokens_seen": 17418704, + "step": 19255 + }, + { + "epoch": 5.083146363996304, + "grad_norm": 0.0030340959783643484, + "learning_rate": 0.15872477349351757, + "loss": 0.0696, + "num_input_tokens_seen": 17423056, + "step": 19260 + }, + { + "epoch": 5.084466147551802, + "grad_norm": 0.0023080676328390837, + "learning_rate": 0.15866596768749622, + "loss": 0.0613, + "num_input_tokens_seen": 17427344, + "step": 19265 + }, + { + "epoch": 5.085785931107298, + "grad_norm": 0.0029139965772628784, + "learning_rate": 0.15860716054507373, + "loss": 0.0605, + "num_input_tokens_seen": 17431984, + "step": 19270 + }, + { + "epoch": 5.087105714662795, + "grad_norm": 0.001785807777196169, + "learning_rate": 0.1585483520753189, + "loss": 0.0639, + "num_input_tokens_seen": 17436880, + "step": 19275 + }, + { + "epoch": 5.088425498218292, + "grad_norm": 0.001204600092023611, + "learning_rate": 0.1584895422873008, + "loss": 0.0426, + "num_input_tokens_seen": 17441296, + "step": 19280 + }, + { + "epoch": 5.089745281773789, + "grad_norm": 0.0017795447492972016, + "learning_rate": 0.1584307311900886, + "loss": 0.0602, + "num_input_tokens_seen": 17445840, + "step": 19285 + }, + { + "epoch": 5.091065065329286, + "grad_norm": 0.0004239033441990614, + "learning_rate": 0.1583719187927517, + "loss": 0.0432, + "num_input_tokens_seen": 17450704, + "step": 19290 + }, + { + "epoch": 5.092384848884783, + "grad_norm": 0.0021388332825154066, + "learning_rate": 0.15831310510435967, + "loss": 0.0542, + "num_input_tokens_seen": 17455344, + "step": 19295 + }, + { + "epoch": 5.0937046324402795, + "grad_norm": 0.0014877936337143183, + "learning_rate": 0.15825429013398243, + "loss": 0.036, + "num_input_tokens_seen": 17459920, + "step": 19300 + }, + { + "epoch": 5.095024415995777, + "grad_norm": 0.0021860136184841394, + "learning_rate": 0.15819547389068986, + "loss": 0.0575, + "num_input_tokens_seen": 17464656, + "step": 19305 + }, + { + "epoch": 5.0963441995512735, + "grad_norm": 0.0010760592995211482, + "learning_rate": 0.1581366563835522, + "loss": 0.0379, + "num_input_tokens_seen": 17469072, + "step": 19310 + }, + { + "epoch": 5.09766398310677, + "grad_norm": 0.0009695332846604288, + "learning_rate": 0.15807783762163993, + "loss": 0.065, + "num_input_tokens_seen": 17473488, + "step": 19315 + }, + { + "epoch": 5.0989837666622675, + "grad_norm": 0.0011577383847907186, + "learning_rate": 0.15801901761402365, + "loss": 0.029, + "num_input_tokens_seen": 17477712, + "step": 19320 + }, + { + "epoch": 5.100303550217764, + "grad_norm": 0.00350553379394114, + "learning_rate": 0.157960196369774, + "loss": 0.0921, + "num_input_tokens_seen": 17481936, + "step": 19325 + }, + { + "epoch": 5.1016233337732615, + "grad_norm": 0.0022977504413574934, + "learning_rate": 0.157901373897962, + "loss": 0.0434, + "num_input_tokens_seen": 17486608, + "step": 19330 + }, + { + "epoch": 5.102943117328758, + "grad_norm": 0.0026541566476225853, + "learning_rate": 0.15784255020765892, + "loss": 0.0691, + "num_input_tokens_seen": 17491024, + "step": 19335 + }, + { + "epoch": 5.104262900884255, + "grad_norm": 0.0008568124612793326, + "learning_rate": 0.157783725307936, + "loss": 0.0345, + "num_input_tokens_seen": 17495600, + "step": 19340 + }, + { + "epoch": 5.105582684439752, + "grad_norm": 0.001815039082430303, + "learning_rate": 0.15772489920786484, + "loss": 0.061, + "num_input_tokens_seen": 17500144, + "step": 19345 + }, + { + "epoch": 5.106902467995249, + "grad_norm": 0.0034983684308826923, + "learning_rate": 0.15766607191651713, + "loss": 0.0652, + "num_input_tokens_seen": 17504592, + "step": 19350 + }, + { + "epoch": 5.108222251550746, + "grad_norm": 0.0020048075821250677, + "learning_rate": 0.1576072434429648, + "loss": 0.0878, + "num_input_tokens_seen": 17509200, + "step": 19355 + }, + { + "epoch": 5.109542035106243, + "grad_norm": 0.0010843102354556322, + "learning_rate": 0.15754841379627998, + "loss": 0.0374, + "num_input_tokens_seen": 17514160, + "step": 19360 + }, + { + "epoch": 5.110861818661739, + "grad_norm": 0.0016484004445374012, + "learning_rate": 0.15748958298553484, + "loss": 0.0368, + "num_input_tokens_seen": 17518896, + "step": 19365 + }, + { + "epoch": 5.112181602217237, + "grad_norm": 0.003528967732563615, + "learning_rate": 0.1574307510198019, + "loss": 0.0868, + "num_input_tokens_seen": 17523504, + "step": 19370 + }, + { + "epoch": 5.113501385772733, + "grad_norm": 0.0035256301052868366, + "learning_rate": 0.15737191790815375, + "loss": 0.071, + "num_input_tokens_seen": 17527760, + "step": 19375 + }, + { + "epoch": 5.114821169328231, + "grad_norm": 0.001226901076734066, + "learning_rate": 0.15731308365966323, + "loss": 0.1014, + "num_input_tokens_seen": 17532016, + "step": 19380 + }, + { + "epoch": 5.116140952883727, + "grad_norm": 0.0016174068441614509, + "learning_rate": 0.15725424828340331, + "loss": 0.0805, + "num_input_tokens_seen": 17536400, + "step": 19385 + }, + { + "epoch": 5.117460736439224, + "grad_norm": 0.0010296449763700366, + "learning_rate": 0.15719541178844715, + "loss": 0.0467, + "num_input_tokens_seen": 17540816, + "step": 19390 + }, + { + "epoch": 5.118780519994721, + "grad_norm": 0.0017082421109080315, + "learning_rate": 0.15713657418386806, + "loss": 0.0649, + "num_input_tokens_seen": 17545680, + "step": 19395 + }, + { + "epoch": 5.120100303550218, + "grad_norm": 0.0013072972651571035, + "learning_rate": 0.15707773547873957, + "loss": 0.0478, + "num_input_tokens_seen": 17549968, + "step": 19400 + }, + { + "epoch": 5.120100303550218, + "eval_loss": 0.09013891220092773, + "eval_runtime": 75.9691, + "eval_samples_per_second": 88.654, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 17549968, + "step": 19400 + }, + { + "epoch": 5.121420087105714, + "grad_norm": 0.003562075085937977, + "learning_rate": 0.1570188956821353, + "loss": 0.0635, + "num_input_tokens_seen": 17554416, + "step": 19405 + }, + { + "epoch": 5.122739870661212, + "grad_norm": 0.0015855201054364443, + "learning_rate": 0.1569600548031291, + "loss": 0.0413, + "num_input_tokens_seen": 17559312, + "step": 19410 + }, + { + "epoch": 5.124059654216708, + "grad_norm": 0.0024157948791980743, + "learning_rate": 0.156901212850795, + "loss": 0.0631, + "num_input_tokens_seen": 17563888, + "step": 19415 + }, + { + "epoch": 5.125379437772206, + "grad_norm": 0.0005380894872359931, + "learning_rate": 0.15684236983420716, + "loss": 0.0714, + "num_input_tokens_seen": 17568464, + "step": 19420 + }, + { + "epoch": 5.126699221327702, + "grad_norm": 0.001987625379115343, + "learning_rate": 0.1567835257624399, + "loss": 0.0441, + "num_input_tokens_seen": 17572944, + "step": 19425 + }, + { + "epoch": 5.128019004883199, + "grad_norm": 0.0028678271919488907, + "learning_rate": 0.1567246806445677, + "loss": 0.086, + "num_input_tokens_seen": 17577456, + "step": 19430 + }, + { + "epoch": 5.129338788438696, + "grad_norm": 0.004006334114819765, + "learning_rate": 0.15666583448966526, + "loss": 0.0737, + "num_input_tokens_seen": 17581968, + "step": 19435 + }, + { + "epoch": 5.130658571994193, + "grad_norm": 0.000725860649254173, + "learning_rate": 0.1566069873068074, + "loss": 0.0691, + "num_input_tokens_seen": 17586512, + "step": 19440 + }, + { + "epoch": 5.131978355549689, + "grad_norm": 0.0017176733817905188, + "learning_rate": 0.156548139105069, + "loss": 0.0667, + "num_input_tokens_seen": 17591056, + "step": 19445 + }, + { + "epoch": 5.133298139105187, + "grad_norm": 0.0023242379538714886, + "learning_rate": 0.15648928989352529, + "loss": 0.0653, + "num_input_tokens_seen": 17595408, + "step": 19450 + }, + { + "epoch": 5.134617922660683, + "grad_norm": 0.0016284467419609427, + "learning_rate": 0.15643043968125156, + "loss": 0.0803, + "num_input_tokens_seen": 17600112, + "step": 19455 + }, + { + "epoch": 5.135937706216181, + "grad_norm": 0.0018911658553406596, + "learning_rate": 0.15637158847732316, + "loss": 0.0359, + "num_input_tokens_seen": 17604848, + "step": 19460 + }, + { + "epoch": 5.137257489771677, + "grad_norm": 0.003406879026442766, + "learning_rate": 0.15631273629081582, + "loss": 0.0826, + "num_input_tokens_seen": 17609424, + "step": 19465 + }, + { + "epoch": 5.138577273327174, + "grad_norm": 0.0018104281043633819, + "learning_rate": 0.15625388313080518, + "loss": 0.0536, + "num_input_tokens_seen": 17613872, + "step": 19470 + }, + { + "epoch": 5.139897056882671, + "grad_norm": 0.0020089768804609776, + "learning_rate": 0.15619502900636714, + "loss": 0.064, + "num_input_tokens_seen": 17618352, + "step": 19475 + }, + { + "epoch": 5.141216840438168, + "grad_norm": 0.0018953137332573533, + "learning_rate": 0.15613617392657783, + "loss": 0.0634, + "num_input_tokens_seen": 17622768, + "step": 19480 + }, + { + "epoch": 5.142536623993665, + "grad_norm": 0.003165459493175149, + "learning_rate": 0.15607731790051335, + "loss": 0.0587, + "num_input_tokens_seen": 17627280, + "step": 19485 + }, + { + "epoch": 5.143856407549162, + "grad_norm": 0.0031565583776682615, + "learning_rate": 0.15601846093725008, + "loss": 0.0706, + "num_input_tokens_seen": 17631440, + "step": 19490 + }, + { + "epoch": 5.1451761911046585, + "grad_norm": 0.0015405830927193165, + "learning_rate": 0.1559596030458645, + "loss": 0.0575, + "num_input_tokens_seen": 17636080, + "step": 19495 + }, + { + "epoch": 5.146495974660156, + "grad_norm": 0.0029934532940387726, + "learning_rate": 0.1559007442354333, + "loss": 0.0859, + "num_input_tokens_seen": 17640976, + "step": 19500 + }, + { + "epoch": 5.1478157582156525, + "grad_norm": 0.0026672882959246635, + "learning_rate": 0.15584188451503314, + "loss": 0.0409, + "num_input_tokens_seen": 17645552, + "step": 19505 + }, + { + "epoch": 5.14913554177115, + "grad_norm": 0.004464342724531889, + "learning_rate": 0.15578302389374094, + "loss": 0.0838, + "num_input_tokens_seen": 17649872, + "step": 19510 + }, + { + "epoch": 5.1504553253266465, + "grad_norm": 0.002351503586396575, + "learning_rate": 0.1557241623806338, + "loss": 0.0636, + "num_input_tokens_seen": 17654224, + "step": 19515 + }, + { + "epoch": 5.151775108882143, + "grad_norm": 0.0012500790180638433, + "learning_rate": 0.15566529998478887, + "loss": 0.0694, + "num_input_tokens_seen": 17658896, + "step": 19520 + }, + { + "epoch": 5.1530948924376405, + "grad_norm": 0.0010138206416741014, + "learning_rate": 0.15560643671528354, + "loss": 0.0385, + "num_input_tokens_seen": 17663248, + "step": 19525 + }, + { + "epoch": 5.154414675993137, + "grad_norm": 0.001937060966156423, + "learning_rate": 0.15554757258119514, + "loss": 0.0557, + "num_input_tokens_seen": 17667536, + "step": 19530 + }, + { + "epoch": 5.155734459548634, + "grad_norm": 0.000723449164070189, + "learning_rate": 0.1554887075916014, + "loss": 0.0344, + "num_input_tokens_seen": 17671920, + "step": 19535 + }, + { + "epoch": 5.157054243104131, + "grad_norm": 0.002919472986832261, + "learning_rate": 0.15542984175558, + "loss": 0.0608, + "num_input_tokens_seen": 17676624, + "step": 19540 + }, + { + "epoch": 5.158374026659628, + "grad_norm": 0.003560788929462433, + "learning_rate": 0.1553709750822087, + "loss": 0.0594, + "num_input_tokens_seen": 17681040, + "step": 19545 + }, + { + "epoch": 5.159693810215125, + "grad_norm": 0.0024379128590226173, + "learning_rate": 0.15531210758056554, + "loss": 0.0568, + "num_input_tokens_seen": 17685584, + "step": 19550 + }, + { + "epoch": 5.161013593770622, + "grad_norm": 0.001475768513046205, + "learning_rate": 0.15525323925972867, + "loss": 0.0575, + "num_input_tokens_seen": 17689904, + "step": 19555 + }, + { + "epoch": 5.162333377326118, + "grad_norm": 0.0017463164404034615, + "learning_rate": 0.15519437012877627, + "loss": 0.0703, + "num_input_tokens_seen": 17694480, + "step": 19560 + }, + { + "epoch": 5.163653160881616, + "grad_norm": 0.0038941078819334507, + "learning_rate": 0.15513550019678676, + "loss": 0.0658, + "num_input_tokens_seen": 17698832, + "step": 19565 + }, + { + "epoch": 5.164972944437112, + "grad_norm": 0.004748775158077478, + "learning_rate": 0.15507662947283854, + "loss": 0.0704, + "num_input_tokens_seen": 17703120, + "step": 19570 + }, + { + "epoch": 5.16629272799261, + "grad_norm": 0.0024005838204175234, + "learning_rate": 0.15501775796601028, + "loss": 0.055, + "num_input_tokens_seen": 17707536, + "step": 19575 + }, + { + "epoch": 5.167612511548106, + "grad_norm": 0.002185491845011711, + "learning_rate": 0.15495888568538066, + "loss": 0.0786, + "num_input_tokens_seen": 17711888, + "step": 19580 + }, + { + "epoch": 5.168932295103603, + "grad_norm": 0.0020229287911206484, + "learning_rate": 0.1549000126400286, + "loss": 0.0608, + "num_input_tokens_seen": 17716592, + "step": 19585 + }, + { + "epoch": 5.1702520786591, + "grad_norm": 0.004081906285136938, + "learning_rate": 0.15484113883903294, + "loss": 0.1262, + "num_input_tokens_seen": 17720976, + "step": 19590 + }, + { + "epoch": 5.171571862214597, + "grad_norm": 0.0016208807937800884, + "learning_rate": 0.15478226429147288, + "loss": 0.0494, + "num_input_tokens_seen": 17725488, + "step": 19595 + }, + { + "epoch": 5.172891645770093, + "grad_norm": 0.0029528848826885223, + "learning_rate": 0.15472338900642757, + "loss": 0.0693, + "num_input_tokens_seen": 17730032, + "step": 19600 + }, + { + "epoch": 5.172891645770093, + "eval_loss": 0.09193611890077591, + "eval_runtime": 75.9637, + "eval_samples_per_second": 88.661, + "eval_steps_per_second": 22.168, + "num_input_tokens_seen": 17730032, + "step": 19600 + }, + { + "epoch": 5.174211429325591, + "grad_norm": 0.0019571322482079268, + "learning_rate": 0.15466451299297632, + "loss": 0.0438, + "num_input_tokens_seen": 17734608, + "step": 19605 + }, + { + "epoch": 5.175531212881087, + "grad_norm": 0.001084325835108757, + "learning_rate": 0.15460563626019852, + "loss": 0.0686, + "num_input_tokens_seen": 17739088, + "step": 19610 + }, + { + "epoch": 5.176850996436585, + "grad_norm": 0.002222332637757063, + "learning_rate": 0.15454675881717375, + "loss": 0.0465, + "num_input_tokens_seen": 17743824, + "step": 19615 + }, + { + "epoch": 5.178170779992081, + "grad_norm": 0.0029871941078454256, + "learning_rate": 0.1544878806729816, + "loss": 0.0633, + "num_input_tokens_seen": 17748016, + "step": 19620 + }, + { + "epoch": 5.179490563547578, + "grad_norm": 0.0007607754087075591, + "learning_rate": 0.1544290018367019, + "loss": 0.0392, + "num_input_tokens_seen": 17752816, + "step": 19625 + }, + { + "epoch": 5.180810347103075, + "grad_norm": 0.002369154943153262, + "learning_rate": 0.15437012231741445, + "loss": 0.0677, + "num_input_tokens_seen": 17757232, + "step": 19630 + }, + { + "epoch": 5.182130130658572, + "grad_norm": 0.00068361108424142, + "learning_rate": 0.1543112421241992, + "loss": 0.0538, + "num_input_tokens_seen": 17761776, + "step": 19635 + }, + { + "epoch": 5.183449914214069, + "grad_norm": 0.0018967061769217253, + "learning_rate": 0.15425236126613626, + "loss": 0.0428, + "num_input_tokens_seen": 17766128, + "step": 19640 + }, + { + "epoch": 5.184769697769566, + "grad_norm": 0.004017968196421862, + "learning_rate": 0.15419347975230577, + "loss": 0.0591, + "num_input_tokens_seen": 17770768, + "step": 19645 + }, + { + "epoch": 5.186089481325062, + "grad_norm": 0.0013114714529365301, + "learning_rate": 0.154134597591788, + "loss": 0.1353, + "num_input_tokens_seen": 17775152, + "step": 19650 + }, + { + "epoch": 5.18740926488056, + "grad_norm": 0.0026044538244605064, + "learning_rate": 0.1540757147936633, + "loss": 0.0706, + "num_input_tokens_seen": 17779760, + "step": 19655 + }, + { + "epoch": 5.188729048436056, + "grad_norm": 0.0023608712945133448, + "learning_rate": 0.1540168313670122, + "loss": 0.0627, + "num_input_tokens_seen": 17784016, + "step": 19660 + }, + { + "epoch": 5.190048831991553, + "grad_norm": 0.0022661047987639904, + "learning_rate": 0.1539579473209152, + "loss": 0.0986, + "num_input_tokens_seen": 17788944, + "step": 19665 + }, + { + "epoch": 5.19136861554705, + "grad_norm": 0.0012418980477377772, + "learning_rate": 0.15389906266445294, + "loss": 0.0438, + "num_input_tokens_seen": 17793552, + "step": 19670 + }, + { + "epoch": 5.192688399102547, + "grad_norm": 0.0016542632365599275, + "learning_rate": 0.15384017740670627, + "loss": 0.0485, + "num_input_tokens_seen": 17797936, + "step": 19675 + }, + { + "epoch": 5.194008182658044, + "grad_norm": 0.002912329975515604, + "learning_rate": 0.15378129155675602, + "loss": 0.0478, + "num_input_tokens_seen": 17802480, + "step": 19680 + }, + { + "epoch": 5.195327966213541, + "grad_norm": 0.0018605480436235666, + "learning_rate": 0.15372240512368307, + "loss": 0.1016, + "num_input_tokens_seen": 17806960, + "step": 19685 + }, + { + "epoch": 5.1966477497690375, + "grad_norm": 0.002211257116869092, + "learning_rate": 0.1536635181165684, + "loss": 0.0741, + "num_input_tokens_seen": 17811440, + "step": 19690 + }, + { + "epoch": 5.197967533324535, + "grad_norm": 0.00238100066781044, + "learning_rate": 0.15360463054449328, + "loss": 0.0691, + "num_input_tokens_seen": 17815856, + "step": 19695 + }, + { + "epoch": 5.1992873168800315, + "grad_norm": 0.0028826268389821053, + "learning_rate": 0.1535457424165388, + "loss": 0.0648, + "num_input_tokens_seen": 17820304, + "step": 19700 + }, + { + "epoch": 5.200607100435529, + "grad_norm": 0.0016716584796085954, + "learning_rate": 0.15348685374178628, + "loss": 0.0492, + "num_input_tokens_seen": 17824976, + "step": 19705 + }, + { + "epoch": 5.2019268839910255, + "grad_norm": 0.002208465477451682, + "learning_rate": 0.1534279645293171, + "loss": 0.0933, + "num_input_tokens_seen": 17829264, + "step": 19710 + }, + { + "epoch": 5.203246667546522, + "grad_norm": 0.003321991069242358, + "learning_rate": 0.1533690747882127, + "loss": 0.0868, + "num_input_tokens_seen": 17833680, + "step": 19715 + }, + { + "epoch": 5.2045664511020195, + "grad_norm": 0.0012724704574793577, + "learning_rate": 0.15331018452755465, + "loss": 0.0627, + "num_input_tokens_seen": 17838384, + "step": 19720 + }, + { + "epoch": 5.205886234657516, + "grad_norm": 0.0009724131668917835, + "learning_rate": 0.15325129375642457, + "loss": 0.0864, + "num_input_tokens_seen": 17842800, + "step": 19725 + }, + { + "epoch": 5.207206018213013, + "grad_norm": 0.0017342029605060816, + "learning_rate": 0.15319240248390406, + "loss": 0.0581, + "num_input_tokens_seen": 17847216, + "step": 19730 + }, + { + "epoch": 5.20852580176851, + "grad_norm": 0.002556441817432642, + "learning_rate": 0.153133510719075, + "loss": 0.0651, + "num_input_tokens_seen": 17851568, + "step": 19735 + }, + { + "epoch": 5.209845585324007, + "grad_norm": 0.0013150498270988464, + "learning_rate": 0.15307461847101922, + "loss": 0.0746, + "num_input_tokens_seen": 17856112, + "step": 19740 + }, + { + "epoch": 5.211165368879504, + "grad_norm": 0.003653924446552992, + "learning_rate": 0.15301572574881864, + "loss": 0.0522, + "num_input_tokens_seen": 17860816, + "step": 19745 + }, + { + "epoch": 5.212485152435001, + "grad_norm": 0.000956129573751241, + "learning_rate": 0.15295683256155523, + "loss": 0.0563, + "num_input_tokens_seen": 17864976, + "step": 19750 + }, + { + "epoch": 5.213804935990497, + "grad_norm": 0.002149691339582205, + "learning_rate": 0.15289793891831113, + "loss": 0.0633, + "num_input_tokens_seen": 17869232, + "step": 19755 + }, + { + "epoch": 5.215124719545995, + "grad_norm": 0.002956223441287875, + "learning_rate": 0.15283904482816837, + "loss": 0.0702, + "num_input_tokens_seen": 17873840, + "step": 19760 + }, + { + "epoch": 5.216444503101491, + "grad_norm": 0.002304049441590905, + "learning_rate": 0.15278015030020928, + "loss": 0.0591, + "num_input_tokens_seen": 17878608, + "step": 19765 + }, + { + "epoch": 5.217764286656989, + "grad_norm": 0.0014933205675333738, + "learning_rate": 0.152721255343516, + "loss": 0.1007, + "num_input_tokens_seen": 17883248, + "step": 19770 + }, + { + "epoch": 5.219084070212485, + "grad_norm": 0.0032565665896981955, + "learning_rate": 0.15266235996717098, + "loss": 0.0651, + "num_input_tokens_seen": 17887728, + "step": 19775 + }, + { + "epoch": 5.220403853767982, + "grad_norm": 0.001493057468906045, + "learning_rate": 0.15260346418025664, + "loss": 0.0456, + "num_input_tokens_seen": 17892688, + "step": 19780 + }, + { + "epoch": 5.221723637323479, + "grad_norm": 0.002421586774289608, + "learning_rate": 0.15254456799185537, + "loss": 0.0648, + "num_input_tokens_seen": 17897200, + "step": 19785 + }, + { + "epoch": 5.223043420878976, + "grad_norm": 0.0015986099606379867, + "learning_rate": 0.15248567141104974, + "loss": 0.0657, + "num_input_tokens_seen": 17901584, + "step": 19790 + }, + { + "epoch": 5.224363204434472, + "grad_norm": 0.004017834085971117, + "learning_rate": 0.15242677444692232, + "loss": 0.0589, + "num_input_tokens_seen": 17905872, + "step": 19795 + }, + { + "epoch": 5.22568298798997, + "grad_norm": 0.0027966133784502745, + "learning_rate": 0.15236787710855584, + "loss": 0.1146, + "num_input_tokens_seen": 17910128, + "step": 19800 + }, + { + "epoch": 5.22568298798997, + "eval_loss": 0.09016967564821243, + "eval_runtime": 75.909, + "eval_samples_per_second": 88.725, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 17910128, + "step": 19800 + }, + { + "epoch": 5.227002771545466, + "grad_norm": 0.0014597807312384248, + "learning_rate": 0.1523089794050329, + "loss": 0.0681, + "num_input_tokens_seen": 17914672, + "step": 19805 + }, + { + "epoch": 5.228322555100964, + "grad_norm": 0.0034494507126510143, + "learning_rate": 0.15225008134543633, + "loss": 0.0601, + "num_input_tokens_seen": 17919504, + "step": 19810 + }, + { + "epoch": 5.22964233865646, + "grad_norm": 0.0008471140754409134, + "learning_rate": 0.15219118293884895, + "loss": 0.0342, + "num_input_tokens_seen": 17924336, + "step": 19815 + }, + { + "epoch": 5.230962122211957, + "grad_norm": 0.002714337781071663, + "learning_rate": 0.15213228419435362, + "loss": 0.0569, + "num_input_tokens_seen": 17928976, + "step": 19820 + }, + { + "epoch": 5.232281905767454, + "grad_norm": 0.0015600319020450115, + "learning_rate": 0.15207338512103327, + "loss": 0.0493, + "num_input_tokens_seen": 17933328, + "step": 19825 + }, + { + "epoch": 5.233601689322951, + "grad_norm": 0.0015668582636862993, + "learning_rate": 0.1520144857279709, + "loss": 0.0493, + "num_input_tokens_seen": 17937680, + "step": 19830 + }, + { + "epoch": 5.234921472878448, + "grad_norm": 0.0011614831164479256, + "learning_rate": 0.1519555860242495, + "loss": 0.0552, + "num_input_tokens_seen": 17942288, + "step": 19835 + }, + { + "epoch": 5.236241256433945, + "grad_norm": 0.0017419499345123768, + "learning_rate": 0.15189668601895218, + "loss": 0.0808, + "num_input_tokens_seen": 17946864, + "step": 19840 + }, + { + "epoch": 5.2375610399894414, + "grad_norm": 0.0013962944503873587, + "learning_rate": 0.151837785721162, + "loss": 0.0574, + "num_input_tokens_seen": 17951408, + "step": 19845 + }, + { + "epoch": 5.238880823544939, + "grad_norm": 0.0037479314487427473, + "learning_rate": 0.15177888513996218, + "loss": 0.0819, + "num_input_tokens_seen": 17955824, + "step": 19850 + }, + { + "epoch": 5.2402006071004354, + "grad_norm": 0.003372778417542577, + "learning_rate": 0.15171998428443592, + "loss": 0.0718, + "num_input_tokens_seen": 17960368, + "step": 19855 + }, + { + "epoch": 5.241520390655932, + "grad_norm": 0.001810135436244309, + "learning_rate": 0.1516610831636665, + "loss": 0.0756, + "num_input_tokens_seen": 17964944, + "step": 19860 + }, + { + "epoch": 5.2428401742114294, + "grad_norm": 0.003967308439314365, + "learning_rate": 0.15160218178673715, + "loss": 0.0464, + "num_input_tokens_seen": 17969392, + "step": 19865 + }, + { + "epoch": 5.244159957766926, + "grad_norm": 0.0025125148240476847, + "learning_rate": 0.15154328016273122, + "loss": 0.0893, + "num_input_tokens_seen": 17974064, + "step": 19870 + }, + { + "epoch": 5.2454797413224235, + "grad_norm": 0.0012376331724226475, + "learning_rate": 0.1514843783007321, + "loss": 0.0398, + "num_input_tokens_seen": 17978480, + "step": 19875 + }, + { + "epoch": 5.24679952487792, + "grad_norm": 0.00243278662674129, + "learning_rate": 0.15142547620982322, + "loss": 0.0539, + "num_input_tokens_seen": 17982800, + "step": 19880 + }, + { + "epoch": 5.248119308433417, + "grad_norm": 0.0005589924985542893, + "learning_rate": 0.15136657389908797, + "loss": 0.0352, + "num_input_tokens_seen": 17987408, + "step": 19885 + }, + { + "epoch": 5.249439091988914, + "grad_norm": 0.0021883430890738964, + "learning_rate": 0.15130767137760986, + "loss": 0.0785, + "num_input_tokens_seen": 17991984, + "step": 19890 + }, + { + "epoch": 5.250758875544411, + "grad_norm": 0.001920756883919239, + "learning_rate": 0.15124876865447243, + "loss": 0.0365, + "num_input_tokens_seen": 17996368, + "step": 19895 + }, + { + "epoch": 5.252078659099908, + "grad_norm": 0.0006925527704879642, + "learning_rate": 0.15118986573875912, + "loss": 0.0635, + "num_input_tokens_seen": 18001008, + "step": 19900 + }, + { + "epoch": 5.253398442655405, + "grad_norm": 0.0023225743789225817, + "learning_rate": 0.15113096263955358, + "loss": 0.0632, + "num_input_tokens_seen": 18005488, + "step": 19905 + }, + { + "epoch": 5.254718226210901, + "grad_norm": 0.0016232311027124524, + "learning_rate": 0.1510720593659394, + "loss": 0.0671, + "num_input_tokens_seen": 18010320, + "step": 19910 + }, + { + "epoch": 5.256038009766399, + "grad_norm": 0.0004887236864306033, + "learning_rate": 0.15101315592700015, + "loss": 0.1174, + "num_input_tokens_seen": 18014768, + "step": 19915 + }, + { + "epoch": 5.257357793321895, + "grad_norm": 0.003651192644611001, + "learning_rate": 0.15095425233181956, + "loss": 0.0892, + "num_input_tokens_seen": 18018960, + "step": 19920 + }, + { + "epoch": 5.258677576877393, + "grad_norm": 0.002340351464226842, + "learning_rate": 0.15089534858948128, + "loss": 0.0558, + "num_input_tokens_seen": 18023408, + "step": 19925 + }, + { + "epoch": 5.259997360432889, + "grad_norm": 0.0011352875735610723, + "learning_rate": 0.15083644470906898, + "loss": 0.0613, + "num_input_tokens_seen": 18028016, + "step": 19930 + }, + { + "epoch": 5.261317143988386, + "grad_norm": 0.0027824316639453173, + "learning_rate": 0.1507775406996664, + "loss": 0.0913, + "num_input_tokens_seen": 18032272, + "step": 19935 + }, + { + "epoch": 5.262636927543883, + "grad_norm": 0.0017036934150382876, + "learning_rate": 0.15071863657035725, + "loss": 0.0524, + "num_input_tokens_seen": 18036848, + "step": 19940 + }, + { + "epoch": 5.26395671109938, + "grad_norm": 0.0005247584776952863, + "learning_rate": 0.15065973233022534, + "loss": 0.0428, + "num_input_tokens_seen": 18041360, + "step": 19945 + }, + { + "epoch": 5.265276494654876, + "grad_norm": 0.00034709181636571884, + "learning_rate": 0.15060082798835442, + "loss": 0.0529, + "num_input_tokens_seen": 18045712, + "step": 19950 + }, + { + "epoch": 5.266596278210374, + "grad_norm": 0.0009554639109410346, + "learning_rate": 0.15054192355382823, + "loss": 0.0596, + "num_input_tokens_seen": 18050288, + "step": 19955 + }, + { + "epoch": 5.26791606176587, + "grad_norm": 0.00160784216132015, + "learning_rate": 0.15048301903573066, + "loss": 0.0511, + "num_input_tokens_seen": 18054672, + "step": 19960 + }, + { + "epoch": 5.269235845321368, + "grad_norm": 0.0028343014419078827, + "learning_rate": 0.15042411444314546, + "loss": 0.1004, + "num_input_tokens_seen": 18059024, + "step": 19965 + }, + { + "epoch": 5.270555628876864, + "grad_norm": 0.0028554366435855627, + "learning_rate": 0.1503652097851565, + "loss": 0.0449, + "num_input_tokens_seen": 18063696, + "step": 19970 + }, + { + "epoch": 5.271875412432361, + "grad_norm": 0.0033703071530908346, + "learning_rate": 0.15030630507084758, + "loss": 0.1666, + "num_input_tokens_seen": 18067984, + "step": 19975 + }, + { + "epoch": 5.273195195987858, + "grad_norm": 0.0014590246137231588, + "learning_rate": 0.1502474003093026, + "loss": 0.0581, + "num_input_tokens_seen": 18072816, + "step": 19980 + }, + { + "epoch": 5.274514979543355, + "grad_norm": 0.001511604175902903, + "learning_rate": 0.15018849550960536, + "loss": 0.0491, + "num_input_tokens_seen": 18077296, + "step": 19985 + }, + { + "epoch": 5.275834763098851, + "grad_norm": 0.002677131211385131, + "learning_rate": 0.15012959068083975, + "loss": 0.0665, + "num_input_tokens_seen": 18081840, + "step": 19990 + }, + { + "epoch": 5.277154546654349, + "grad_norm": 0.0032331806141883135, + "learning_rate": 0.1500706858320896, + "loss": 0.0734, + "num_input_tokens_seen": 18086384, + "step": 19995 + }, + { + "epoch": 5.278474330209845, + "grad_norm": 0.003992629703134298, + "learning_rate": 0.15001178097243886, + "loss": 0.0553, + "num_input_tokens_seen": 18091056, + "step": 20000 + }, + { + "epoch": 5.278474330209845, + "eval_loss": 0.08903956413269043, + "eval_runtime": 75.9204, + "eval_samples_per_second": 88.711, + "eval_steps_per_second": 22.181, + "num_input_tokens_seen": 18091056, + "step": 20000 + }, + { + "epoch": 5.279794113765343, + "grad_norm": 0.0017598745180293918, + "learning_rate": 0.1499528761109713, + "loss": 0.0453, + "num_input_tokens_seen": 18095696, + "step": 20005 + }, + { + "epoch": 5.281113897320839, + "grad_norm": 0.0003334212233312428, + "learning_rate": 0.14989397125677087, + "loss": 0.042, + "num_input_tokens_seen": 18100368, + "step": 20010 + }, + { + "epoch": 5.282433680876336, + "grad_norm": 0.001075221924111247, + "learning_rate": 0.14983506641892141, + "loss": 0.0583, + "num_input_tokens_seen": 18105136, + "step": 20015 + }, + { + "epoch": 5.283753464431833, + "grad_norm": 0.002621104009449482, + "learning_rate": 0.14977616160650672, + "loss": 0.0771, + "num_input_tokens_seen": 18109712, + "step": 20020 + }, + { + "epoch": 5.28507324798733, + "grad_norm": 0.0019560735672712326, + "learning_rate": 0.14971725682861076, + "loss": 0.0349, + "num_input_tokens_seen": 18114480, + "step": 20025 + }, + { + "epoch": 5.286393031542827, + "grad_norm": 0.0025913978461176157, + "learning_rate": 0.14965835209431738, + "loss": 0.1083, + "num_input_tokens_seen": 18119024, + "step": 20030 + }, + { + "epoch": 5.287712815098324, + "grad_norm": 0.0015628935070708394, + "learning_rate": 0.14959944741271036, + "loss": 0.0878, + "num_input_tokens_seen": 18123472, + "step": 20035 + }, + { + "epoch": 5.2890325986538205, + "grad_norm": 0.000861894863191992, + "learning_rate": 0.14954054279287363, + "loss": 0.0647, + "num_input_tokens_seen": 18127984, + "step": 20040 + }, + { + "epoch": 5.290352382209318, + "grad_norm": 0.00196414184756577, + "learning_rate": 0.14948163824389094, + "loss": 0.0818, + "num_input_tokens_seen": 18132496, + "step": 20045 + }, + { + "epoch": 5.2916721657648145, + "grad_norm": 0.0016208745073527098, + "learning_rate": 0.14942273377484613, + "loss": 0.0594, + "num_input_tokens_seen": 18136944, + "step": 20050 + }, + { + "epoch": 5.292991949320312, + "grad_norm": 0.00252875336445868, + "learning_rate": 0.1493638293948231, + "loss": 0.0665, + "num_input_tokens_seen": 18141360, + "step": 20055 + }, + { + "epoch": 5.2943117328758085, + "grad_norm": 0.0024156998842954636, + "learning_rate": 0.14930492511290547, + "loss": 0.0838, + "num_input_tokens_seen": 18146160, + "step": 20060 + }, + { + "epoch": 5.295631516431305, + "grad_norm": 0.0019292812794446945, + "learning_rate": 0.14924602093817715, + "loss": 0.0764, + "num_input_tokens_seen": 18150480, + "step": 20065 + }, + { + "epoch": 5.2969512999868025, + "grad_norm": 0.0024212270509451628, + "learning_rate": 0.14918711687972194, + "loss": 0.0392, + "num_input_tokens_seen": 18154864, + "step": 20070 + }, + { + "epoch": 5.298271083542299, + "grad_norm": 0.0009442914160899818, + "learning_rate": 0.14912821294662346, + "loss": 0.0689, + "num_input_tokens_seen": 18159504, + "step": 20075 + }, + { + "epoch": 5.299590867097796, + "grad_norm": 0.001839929260313511, + "learning_rate": 0.14906930914796554, + "loss": 0.0646, + "num_input_tokens_seen": 18163536, + "step": 20080 + }, + { + "epoch": 5.300910650653293, + "grad_norm": 0.0012705452973023057, + "learning_rate": 0.14901040549283182, + "loss": 0.0477, + "num_input_tokens_seen": 18167760, + "step": 20085 + }, + { + "epoch": 5.30223043420879, + "grad_norm": 0.0015548218507319689, + "learning_rate": 0.148951501990306, + "loss": 0.0804, + "num_input_tokens_seen": 18172528, + "step": 20090 + }, + { + "epoch": 5.303550217764287, + "grad_norm": 0.0017012007301673293, + "learning_rate": 0.14889259864947177, + "loss": 0.0502, + "num_input_tokens_seen": 18177456, + "step": 20095 + }, + { + "epoch": 5.304870001319784, + "grad_norm": 0.0018614097498357296, + "learning_rate": 0.14883369547941272, + "loss": 0.0786, + "num_input_tokens_seen": 18181936, + "step": 20100 + }, + { + "epoch": 5.30618978487528, + "grad_norm": 0.002196601824834943, + "learning_rate": 0.14877479248921247, + "loss": 0.0491, + "num_input_tokens_seen": 18186448, + "step": 20105 + }, + { + "epoch": 5.307509568430778, + "grad_norm": 0.0030521200969815254, + "learning_rate": 0.14871588968795468, + "loss": 0.0569, + "num_input_tokens_seen": 18190928, + "step": 20110 + }, + { + "epoch": 5.308829351986274, + "grad_norm": 0.002162126824259758, + "learning_rate": 0.1486569870847228, + "loss": 0.0814, + "num_input_tokens_seen": 18195408, + "step": 20115 + }, + { + "epoch": 5.310149135541771, + "grad_norm": 0.0019901252817362547, + "learning_rate": 0.1485980846886004, + "loss": 0.0588, + "num_input_tokens_seen": 18199984, + "step": 20120 + }, + { + "epoch": 5.311468919097268, + "grad_norm": 0.000921361381188035, + "learning_rate": 0.14853918250867096, + "loss": 0.033, + "num_input_tokens_seen": 18204592, + "step": 20125 + }, + { + "epoch": 5.312788702652765, + "grad_norm": 0.0025480049662292004, + "learning_rate": 0.1484802805540179, + "loss": 0.0352, + "num_input_tokens_seen": 18209040, + "step": 20130 + }, + { + "epoch": 5.314108486208262, + "grad_norm": 0.0018844996811822057, + "learning_rate": 0.14842137883372472, + "loss": 0.0673, + "num_input_tokens_seen": 18213392, + "step": 20135 + }, + { + "epoch": 5.315428269763759, + "grad_norm": 0.005234970711171627, + "learning_rate": 0.14836247735687474, + "loss": 0.0662, + "num_input_tokens_seen": 18217840, + "step": 20140 + }, + { + "epoch": 5.316748053319255, + "grad_norm": 0.0014246325008571148, + "learning_rate": 0.14830357613255132, + "loss": 0.0623, + "num_input_tokens_seen": 18222096, + "step": 20145 + }, + { + "epoch": 5.318067836874753, + "grad_norm": 0.004863048437982798, + "learning_rate": 0.1482446751698378, + "loss": 0.0875, + "num_input_tokens_seen": 18226320, + "step": 20150 + }, + { + "epoch": 5.319387620430249, + "grad_norm": 0.0004349609662313014, + "learning_rate": 0.14818577447781744, + "loss": 0.0319, + "num_input_tokens_seen": 18230800, + "step": 20155 + }, + { + "epoch": 5.320707403985747, + "grad_norm": 0.002440724289044738, + "learning_rate": 0.14812687406557346, + "loss": 0.0665, + "num_input_tokens_seen": 18235376, + "step": 20160 + }, + { + "epoch": 5.322027187541243, + "grad_norm": 0.0031786938197910786, + "learning_rate": 0.14806797394218899, + "loss": 0.0828, + "num_input_tokens_seen": 18239920, + "step": 20165 + }, + { + "epoch": 5.32334697109674, + "grad_norm": 0.0017599855782464147, + "learning_rate": 0.1480090741167472, + "loss": 0.0248, + "num_input_tokens_seen": 18244336, + "step": 20170 + }, + { + "epoch": 5.324666754652237, + "grad_norm": 0.0012216256000101566, + "learning_rate": 0.1479501745983313, + "loss": 0.096, + "num_input_tokens_seen": 18249040, + "step": 20175 + }, + { + "epoch": 5.325986538207734, + "grad_norm": 0.0013517317129299045, + "learning_rate": 0.14789127539602415, + "loss": 0.071, + "num_input_tokens_seen": 18253456, + "step": 20180 + }, + { + "epoch": 5.327306321763231, + "grad_norm": 0.002460081595927477, + "learning_rate": 0.14783237651890885, + "loss": 0.0936, + "num_input_tokens_seen": 18257840, + "step": 20185 + }, + { + "epoch": 5.328626105318728, + "grad_norm": 0.0036620739847421646, + "learning_rate": 0.14777347797606838, + "loss": 0.0945, + "num_input_tokens_seen": 18262576, + "step": 20190 + }, + { + "epoch": 5.329945888874224, + "grad_norm": 0.0010465896921232343, + "learning_rate": 0.14771457977658553, + "loss": 0.0408, + "num_input_tokens_seen": 18266864, + "step": 20195 + }, + { + "epoch": 5.331265672429722, + "grad_norm": 0.0008695703581906855, + "learning_rate": 0.14765568192954326, + "loss": 0.0548, + "num_input_tokens_seen": 18271440, + "step": 20200 + }, + { + "epoch": 5.331265672429722, + "eval_loss": 0.09063807874917984, + "eval_runtime": 75.8809, + "eval_samples_per_second": 88.758, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 18271440, + "step": 20200 + }, + { + "epoch": 5.332585455985218, + "grad_norm": 0.0035819425247609615, + "learning_rate": 0.14759678444402421, + "loss": 0.0984, + "num_input_tokens_seen": 18275760, + "step": 20205 + }, + { + "epoch": 5.333905239540715, + "grad_norm": 0.0029523198027163744, + "learning_rate": 0.14753788732911122, + "loss": 0.085, + "num_input_tokens_seen": 18280208, + "step": 20210 + }, + { + "epoch": 5.335225023096212, + "grad_norm": 0.0005807215347886086, + "learning_rate": 0.147478990593887, + "loss": 0.041, + "num_input_tokens_seen": 18284592, + "step": 20215 + }, + { + "epoch": 5.336544806651709, + "grad_norm": 0.00235072267241776, + "learning_rate": 0.14742009424743405, + "loss": 0.1096, + "num_input_tokens_seen": 18288976, + "step": 20220 + }, + { + "epoch": 5.337864590207206, + "grad_norm": 0.0032932606991380453, + "learning_rate": 0.14736119829883504, + "loss": 0.0609, + "num_input_tokens_seen": 18293520, + "step": 20225 + }, + { + "epoch": 5.339184373762703, + "grad_norm": 0.0007604158017784357, + "learning_rate": 0.14730230275717243, + "loss": 0.0603, + "num_input_tokens_seen": 18297904, + "step": 20230 + }, + { + "epoch": 5.3405041573181995, + "grad_norm": 0.000985300517641008, + "learning_rate": 0.14724340763152854, + "loss": 0.0452, + "num_input_tokens_seen": 18302512, + "step": 20235 + }, + { + "epoch": 5.341823940873697, + "grad_norm": 0.0013470706762745976, + "learning_rate": 0.14718451293098594, + "loss": 0.0894, + "num_input_tokens_seen": 18306704, + "step": 20240 + }, + { + "epoch": 5.3431437244291935, + "grad_norm": 0.0023244046606123447, + "learning_rate": 0.14712561866462676, + "loss": 0.0697, + "num_input_tokens_seen": 18311152, + "step": 20245 + }, + { + "epoch": 5.34446350798469, + "grad_norm": 0.0016931549180299044, + "learning_rate": 0.1470667248415333, + "loss": 0.049, + "num_input_tokens_seen": 18315728, + "step": 20250 + }, + { + "epoch": 5.3457832915401875, + "grad_norm": 0.0012649968266487122, + "learning_rate": 0.1470078314707878, + "loss": 0.0735, + "num_input_tokens_seen": 18320144, + "step": 20255 + }, + { + "epoch": 5.347103075095684, + "grad_norm": 0.000999724492430687, + "learning_rate": 0.14694893856147223, + "loss": 0.0555, + "num_input_tokens_seen": 18324624, + "step": 20260 + }, + { + "epoch": 5.3484228586511815, + "grad_norm": 0.0021633480209857225, + "learning_rate": 0.14689004612266868, + "loss": 0.0394, + "num_input_tokens_seen": 18329008, + "step": 20265 + }, + { + "epoch": 5.349742642206678, + "grad_norm": 0.0023410096764564514, + "learning_rate": 0.14683115416345913, + "loss": 0.0639, + "num_input_tokens_seen": 18333360, + "step": 20270 + }, + { + "epoch": 5.351062425762175, + "grad_norm": 0.002484993077814579, + "learning_rate": 0.1467722626929254, + "loss": 0.0658, + "num_input_tokens_seen": 18337840, + "step": 20275 + }, + { + "epoch": 5.352382209317672, + "grad_norm": 0.0011217199498787522, + "learning_rate": 0.14671337172014937, + "loss": 0.0776, + "num_input_tokens_seen": 18342320, + "step": 20280 + }, + { + "epoch": 5.353701992873169, + "grad_norm": 0.0016291221836581826, + "learning_rate": 0.14665448125421265, + "loss": 0.073, + "num_input_tokens_seen": 18346896, + "step": 20285 + }, + { + "epoch": 5.355021776428666, + "grad_norm": 0.0011217518476769328, + "learning_rate": 0.146595591304197, + "loss": 0.095, + "num_input_tokens_seen": 18351152, + "step": 20290 + }, + { + "epoch": 5.356341559984163, + "grad_norm": 0.0019117436604574323, + "learning_rate": 0.14653670187918397, + "loss": 0.0923, + "num_input_tokens_seen": 18355632, + "step": 20295 + }, + { + "epoch": 5.357661343539659, + "grad_norm": 0.0021987587679177523, + "learning_rate": 0.14647781298825502, + "loss": 0.0696, + "num_input_tokens_seen": 18360080, + "step": 20300 + }, + { + "epoch": 5.358981127095157, + "grad_norm": 0.001379017485305667, + "learning_rate": 0.14641892464049153, + "loss": 0.0408, + "num_input_tokens_seen": 18364816, + "step": 20305 + }, + { + "epoch": 5.360300910650653, + "grad_norm": 0.001775288605131209, + "learning_rate": 0.14636003684497495, + "loss": 0.0825, + "num_input_tokens_seen": 18369264, + "step": 20310 + }, + { + "epoch": 5.361620694206151, + "grad_norm": 0.0028647207655012608, + "learning_rate": 0.14630114961078636, + "loss": 0.0648, + "num_input_tokens_seen": 18373520, + "step": 20315 + }, + { + "epoch": 5.362940477761647, + "grad_norm": 0.0019264916190877557, + "learning_rate": 0.14624226294700704, + "loss": 0.0615, + "num_input_tokens_seen": 18377744, + "step": 20320 + }, + { + "epoch": 5.364260261317144, + "grad_norm": 0.0017333085415884852, + "learning_rate": 0.14618337686271793, + "loss": 0.0703, + "num_input_tokens_seen": 18382096, + "step": 20325 + }, + { + "epoch": 5.365580044872641, + "grad_norm": 0.0014916709624230862, + "learning_rate": 0.1461244913670001, + "loss": 0.0538, + "num_input_tokens_seen": 18386832, + "step": 20330 + }, + { + "epoch": 5.366899828428138, + "grad_norm": 0.0008689549867995083, + "learning_rate": 0.1460656064689344, + "loss": 0.0412, + "num_input_tokens_seen": 18391376, + "step": 20335 + }, + { + "epoch": 5.368219611983634, + "grad_norm": 0.0021885314490646124, + "learning_rate": 0.14600672217760163, + "loss": 0.0756, + "num_input_tokens_seen": 18395920, + "step": 20340 + }, + { + "epoch": 5.369539395539132, + "grad_norm": 0.002235347405076027, + "learning_rate": 0.14594783850208248, + "loss": 0.0542, + "num_input_tokens_seen": 18400336, + "step": 20345 + }, + { + "epoch": 5.370859179094628, + "grad_norm": 0.0015627797693014145, + "learning_rate": 0.14588895545145758, + "loss": 0.0668, + "num_input_tokens_seen": 18404816, + "step": 20350 + }, + { + "epoch": 5.372178962650126, + "grad_norm": 0.003485820023342967, + "learning_rate": 0.14583007303480738, + "loss": 0.0795, + "num_input_tokens_seen": 18409680, + "step": 20355 + }, + { + "epoch": 5.373498746205622, + "grad_norm": 0.0004984315019100904, + "learning_rate": 0.14577119126121235, + "loss": 0.0422, + "num_input_tokens_seen": 18414128, + "step": 20360 + }, + { + "epoch": 5.374818529761119, + "grad_norm": 0.0022593755275011063, + "learning_rate": 0.14571231013975272, + "loss": 0.0683, + "num_input_tokens_seen": 18418544, + "step": 20365 + }, + { + "epoch": 5.376138313316616, + "grad_norm": 0.0025496038142591715, + "learning_rate": 0.1456534296795088, + "loss": 0.0731, + "num_input_tokens_seen": 18423056, + "step": 20370 + }, + { + "epoch": 5.377458096872113, + "grad_norm": 0.0028527919203042984, + "learning_rate": 0.14559454988956066, + "loss": 0.0494, + "num_input_tokens_seen": 18427600, + "step": 20375 + }, + { + "epoch": 5.378777880427609, + "grad_norm": 0.0025820701848715544, + "learning_rate": 0.1455356707789882, + "loss": 0.0809, + "num_input_tokens_seen": 18432176, + "step": 20380 + }, + { + "epoch": 5.380097663983107, + "grad_norm": 0.0016683867434039712, + "learning_rate": 0.14547679235687147, + "loss": 0.0533, + "num_input_tokens_seen": 18436880, + "step": 20385 + }, + { + "epoch": 5.381417447538603, + "grad_norm": 0.0017611415823921561, + "learning_rate": 0.14541791463229023, + "loss": 0.058, + "num_input_tokens_seen": 18441680, + "step": 20390 + }, + { + "epoch": 5.382737231094101, + "grad_norm": 0.0015323369298130274, + "learning_rate": 0.14535903761432406, + "loss": 0.0913, + "num_input_tokens_seen": 18445776, + "step": 20395 + }, + { + "epoch": 5.384057014649597, + "grad_norm": 0.0016958954511210322, + "learning_rate": 0.1453001613120527, + "loss": 0.1047, + "num_input_tokens_seen": 18450832, + "step": 20400 + }, + { + "epoch": 5.384057014649597, + "eval_loss": 0.08677903562784195, + "eval_runtime": 76.0634, + "eval_samples_per_second": 88.545, + "eval_steps_per_second": 22.139, + "num_input_tokens_seen": 18450832, + "step": 20400 + }, + { + "epoch": 5.385376798205094, + "grad_norm": 0.0008978097466751933, + "learning_rate": 0.14524128573455547, + "loss": 0.0571, + "num_input_tokens_seen": 18455280, + "step": 20405 + }, + { + "epoch": 5.386696581760591, + "grad_norm": 0.000905179011169821, + "learning_rate": 0.14518241089091177, + "loss": 0.0441, + "num_input_tokens_seen": 18460048, + "step": 20410 + }, + { + "epoch": 5.388016365316088, + "grad_norm": 0.002121978672221303, + "learning_rate": 0.1451235367902009, + "loss": 0.0401, + "num_input_tokens_seen": 18464624, + "step": 20415 + }, + { + "epoch": 5.389336148871585, + "grad_norm": 0.001106569543480873, + "learning_rate": 0.1450646634415019, + "loss": 0.0799, + "num_input_tokens_seen": 18469072, + "step": 20420 + }, + { + "epoch": 5.390655932427082, + "grad_norm": 0.0010837317677214742, + "learning_rate": 0.1450057908538938, + "loss": 0.0779, + "num_input_tokens_seen": 18473424, + "step": 20425 + }, + { + "epoch": 5.3919757159825785, + "grad_norm": 0.00132027140352875, + "learning_rate": 0.14494691903645557, + "loss": 0.0431, + "num_input_tokens_seen": 18477968, + "step": 20430 + }, + { + "epoch": 5.393295499538076, + "grad_norm": 0.0010240190895274282, + "learning_rate": 0.14488804799826588, + "loss": 0.0778, + "num_input_tokens_seen": 18482416, + "step": 20435 + }, + { + "epoch": 5.3946152830935725, + "grad_norm": 0.001725651789456606, + "learning_rate": 0.14482917774840348, + "loss": 0.0825, + "num_input_tokens_seen": 18486864, + "step": 20440 + }, + { + "epoch": 5.39593506664907, + "grad_norm": 0.0012354630744084716, + "learning_rate": 0.14477030829594684, + "loss": 0.0802, + "num_input_tokens_seen": 18491376, + "step": 20445 + }, + { + "epoch": 5.3972548502045665, + "grad_norm": 0.005115749314427376, + "learning_rate": 0.14471143964997432, + "loss": 0.1287, + "num_input_tokens_seen": 18496080, + "step": 20450 + }, + { + "epoch": 5.398574633760063, + "grad_norm": 0.0022228979505598545, + "learning_rate": 0.14465257181956434, + "loss": 0.1216, + "num_input_tokens_seen": 18500624, + "step": 20455 + }, + { + "epoch": 5.3998944173155605, + "grad_norm": 0.0021722593810409307, + "learning_rate": 0.1445937048137949, + "loss": 0.0534, + "num_input_tokens_seen": 18504976, + "step": 20460 + }, + { + "epoch": 5.401214200871057, + "grad_norm": 0.0018339736852794886, + "learning_rate": 0.14453483864174416, + "loss": 0.0407, + "num_input_tokens_seen": 18509488, + "step": 20465 + }, + { + "epoch": 5.402533984426554, + "grad_norm": 0.0010717579862102866, + "learning_rate": 0.14447597331249, + "loss": 0.0678, + "num_input_tokens_seen": 18513808, + "step": 20470 + }, + { + "epoch": 5.403853767982051, + "grad_norm": 0.0016611682949587703, + "learning_rate": 0.1444171088351102, + "loss": 0.0675, + "num_input_tokens_seen": 18518192, + "step": 20475 + }, + { + "epoch": 5.405173551537548, + "grad_norm": 0.0018017591210082173, + "learning_rate": 0.14435824521868235, + "loss": 0.0792, + "num_input_tokens_seen": 18522864, + "step": 20480 + }, + { + "epoch": 5.406493335093045, + "grad_norm": 0.002402413170784712, + "learning_rate": 0.14429938247228397, + "loss": 0.0834, + "num_input_tokens_seen": 18527600, + "step": 20485 + }, + { + "epoch": 5.407813118648542, + "grad_norm": 0.002319303574040532, + "learning_rate": 0.14424052060499243, + "loss": 0.0552, + "num_input_tokens_seen": 18532304, + "step": 20490 + }, + { + "epoch": 5.409132902204038, + "grad_norm": 0.002508410019800067, + "learning_rate": 0.14418165962588506, + "loss": 0.0756, + "num_input_tokens_seen": 18536688, + "step": 20495 + }, + { + "epoch": 5.410452685759536, + "grad_norm": 0.0012177275493741035, + "learning_rate": 0.1441227995440388, + "loss": 0.0358, + "num_input_tokens_seen": 18541392, + "step": 20500 + }, + { + "epoch": 5.411772469315032, + "grad_norm": 0.0001792199327610433, + "learning_rate": 0.14406394036853082, + "loss": 0.0605, + "num_input_tokens_seen": 18545872, + "step": 20505 + }, + { + "epoch": 5.41309225287053, + "grad_norm": 0.002632923424243927, + "learning_rate": 0.14400508210843774, + "loss": 0.0466, + "num_input_tokens_seen": 18550672, + "step": 20510 + }, + { + "epoch": 5.414412036426026, + "grad_norm": 0.0039216079749166965, + "learning_rate": 0.1439462247728364, + "loss": 0.1355, + "num_input_tokens_seen": 18555568, + "step": 20515 + }, + { + "epoch": 5.415731819981523, + "grad_norm": 0.0006771240732632577, + "learning_rate": 0.14388736837080326, + "loss": 0.0497, + "num_input_tokens_seen": 18560016, + "step": 20520 + }, + { + "epoch": 5.41705160353702, + "grad_norm": 0.001733608776703477, + "learning_rate": 0.14382851291141469, + "loss": 0.0789, + "num_input_tokens_seen": 18564464, + "step": 20525 + }, + { + "epoch": 5.418371387092517, + "grad_norm": 0.0030106559861451387, + "learning_rate": 0.14376965840374697, + "loss": 0.0707, + "num_input_tokens_seen": 18568944, + "step": 20530 + }, + { + "epoch": 5.419691170648013, + "grad_norm": 0.001542953890748322, + "learning_rate": 0.14371080485687632, + "loss": 0.0529, + "num_input_tokens_seen": 18573840, + "step": 20535 + }, + { + "epoch": 5.421010954203511, + "grad_norm": 0.0011335433227941394, + "learning_rate": 0.1436519522798785, + "loss": 0.0649, + "num_input_tokens_seen": 18578224, + "step": 20540 + }, + { + "epoch": 5.422330737759007, + "grad_norm": 0.002224937779828906, + "learning_rate": 0.14359310068182948, + "loss": 0.0678, + "num_input_tokens_seen": 18582512, + "step": 20545 + }, + { + "epoch": 5.423650521314505, + "grad_norm": 0.002785845659673214, + "learning_rate": 0.14353425007180484, + "loss": 0.052, + "num_input_tokens_seen": 18587088, + "step": 20550 + }, + { + "epoch": 5.424970304870001, + "grad_norm": 0.002521795453503728, + "learning_rate": 0.14347540045888005, + "loss": 0.09, + "num_input_tokens_seen": 18591664, + "step": 20555 + }, + { + "epoch": 5.426290088425498, + "grad_norm": 0.0014134114608168602, + "learning_rate": 0.14341655185213056, + "loss": 0.0532, + "num_input_tokens_seen": 18596048, + "step": 20560 + }, + { + "epoch": 5.427609871980995, + "grad_norm": 0.00292633892968297, + "learning_rate": 0.14335770426063144, + "loss": 0.0678, + "num_input_tokens_seen": 18600624, + "step": 20565 + }, + { + "epoch": 5.428929655536492, + "grad_norm": 0.0026455663610249758, + "learning_rate": 0.1432988576934578, + "loss": 0.0907, + "num_input_tokens_seen": 18605104, + "step": 20570 + }, + { + "epoch": 5.430249439091989, + "grad_norm": 0.0035927544813603163, + "learning_rate": 0.14324001215968457, + "loss": 0.0902, + "num_input_tokens_seen": 18609680, + "step": 20575 + }, + { + "epoch": 5.431569222647486, + "grad_norm": 0.002905067289248109, + "learning_rate": 0.14318116766838637, + "loss": 0.0546, + "num_input_tokens_seen": 18614192, + "step": 20580 + }, + { + "epoch": 5.432889006202982, + "grad_norm": 0.0015720715746283531, + "learning_rate": 0.14312232422863788, + "loss": 0.0568, + "num_input_tokens_seen": 18618736, + "step": 20585 + }, + { + "epoch": 5.43420878975848, + "grad_norm": 0.0032013896852731705, + "learning_rate": 0.14306348184951334, + "loss": 0.0814, + "num_input_tokens_seen": 18623120, + "step": 20590 + }, + { + "epoch": 5.435528573313976, + "grad_norm": 0.0009712917380966246, + "learning_rate": 0.1430046405400871, + "loss": 0.0586, + "num_input_tokens_seen": 18627664, + "step": 20595 + }, + { + "epoch": 5.436848356869474, + "grad_norm": 0.0015975935384631157, + "learning_rate": 0.14294580030943324, + "loss": 0.0931, + "num_input_tokens_seen": 18632304, + "step": 20600 + }, + { + "epoch": 5.436848356869474, + "eval_loss": 0.08660506457090378, + "eval_runtime": 75.91, + "eval_samples_per_second": 88.724, + "eval_steps_per_second": 22.184, + "num_input_tokens_seen": 18632304, + "step": 20600 + }, + { + "epoch": 5.43816814042497, + "grad_norm": 0.0014104985166341066, + "learning_rate": 0.14288696116662553, + "loss": 0.0962, + "num_input_tokens_seen": 18637104, + "step": 20605 + }, + { + "epoch": 5.439487923980467, + "grad_norm": 0.001673887250944972, + "learning_rate": 0.1428281231207378, + "loss": 0.084, + "num_input_tokens_seen": 18641840, + "step": 20610 + }, + { + "epoch": 5.440807707535964, + "grad_norm": 0.0019287728937342763, + "learning_rate": 0.1427692861808437, + "loss": 0.0651, + "num_input_tokens_seen": 18646480, + "step": 20615 + }, + { + "epoch": 5.442127491091461, + "grad_norm": 0.0007541290251538157, + "learning_rate": 0.1427104503560165, + "loss": 0.0421, + "num_input_tokens_seen": 18651248, + "step": 20620 + }, + { + "epoch": 5.4434472746469575, + "grad_norm": 0.0016660081455484033, + "learning_rate": 0.14265161565532947, + "loss": 0.0556, + "num_input_tokens_seen": 18655952, + "step": 20625 + }, + { + "epoch": 5.444767058202455, + "grad_norm": 0.0038698683492839336, + "learning_rate": 0.14259278208785564, + "loss": 0.0773, + "num_input_tokens_seen": 18660400, + "step": 20630 + }, + { + "epoch": 5.4460868417579515, + "grad_norm": 0.0012774656061083078, + "learning_rate": 0.14253394966266789, + "loss": 0.0424, + "num_input_tokens_seen": 18665136, + "step": 20635 + }, + { + "epoch": 5.447406625313449, + "grad_norm": 0.00243078893981874, + "learning_rate": 0.14247511838883894, + "loss": 0.0793, + "num_input_tokens_seen": 18669744, + "step": 20640 + }, + { + "epoch": 5.4487264088689455, + "grad_norm": 0.002716449787840247, + "learning_rate": 0.14241628827544126, + "loss": 0.0979, + "num_input_tokens_seen": 18674448, + "step": 20645 + }, + { + "epoch": 5.450046192424442, + "grad_norm": 0.0023683207109570503, + "learning_rate": 0.14235745933154723, + "loss": 0.0632, + "num_input_tokens_seen": 18678832, + "step": 20650 + }, + { + "epoch": 5.4513659759799395, + "grad_norm": 0.0034757964313030243, + "learning_rate": 0.14229863156622907, + "loss": 0.0678, + "num_input_tokens_seen": 18683312, + "step": 20655 + }, + { + "epoch": 5.452685759535436, + "grad_norm": 0.001613269792869687, + "learning_rate": 0.14223980498855868, + "loss": 0.0663, + "num_input_tokens_seen": 18687920, + "step": 20660 + }, + { + "epoch": 5.454005543090933, + "grad_norm": 0.0022741835564374924, + "learning_rate": 0.14218097960760792, + "loss": 0.0725, + "num_input_tokens_seen": 18692336, + "step": 20665 + }, + { + "epoch": 5.45532532664643, + "grad_norm": 0.0019496975000947714, + "learning_rate": 0.1421221554324483, + "loss": 0.0407, + "num_input_tokens_seen": 18696560, + "step": 20670 + }, + { + "epoch": 5.456645110201927, + "grad_norm": 0.0030952375382184982, + "learning_rate": 0.1420633324721513, + "loss": 0.0446, + "num_input_tokens_seen": 18701232, + "step": 20675 + }, + { + "epoch": 5.457964893757424, + "grad_norm": 0.0021406332962214947, + "learning_rate": 0.14200451073578824, + "loss": 0.0365, + "num_input_tokens_seen": 18705616, + "step": 20680 + }, + { + "epoch": 5.459284677312921, + "grad_norm": 0.002039713552221656, + "learning_rate": 0.14194569023243003, + "loss": 0.0654, + "num_input_tokens_seen": 18710224, + "step": 20685 + }, + { + "epoch": 5.460604460868417, + "grad_norm": 0.0025649634189903736, + "learning_rate": 0.14188687097114766, + "loss": 0.0508, + "num_input_tokens_seen": 18714672, + "step": 20690 + }, + { + "epoch": 5.461924244423915, + "grad_norm": 0.0010219496907666326, + "learning_rate": 0.14182805296101172, + "loss": 0.0902, + "num_input_tokens_seen": 18719184, + "step": 20695 + }, + { + "epoch": 5.463244027979411, + "grad_norm": 0.003845998551696539, + "learning_rate": 0.14176923621109272, + "loss": 0.0672, + "num_input_tokens_seen": 18723728, + "step": 20700 + }, + { + "epoch": 5.464563811534909, + "grad_norm": 0.0031672250479459763, + "learning_rate": 0.14171042073046097, + "loss": 0.0512, + "num_input_tokens_seen": 18728144, + "step": 20705 + }, + { + "epoch": 5.465883595090405, + "grad_norm": 0.0015591540141031146, + "learning_rate": 0.14165160652818642, + "loss": 0.044, + "num_input_tokens_seen": 18732528, + "step": 20710 + }, + { + "epoch": 5.467203378645902, + "grad_norm": 0.004234136547893286, + "learning_rate": 0.14159279361333907, + "loss": 0.0659, + "num_input_tokens_seen": 18737232, + "step": 20715 + }, + { + "epoch": 5.468523162201399, + "grad_norm": 0.0016630194149911404, + "learning_rate": 0.14153398199498868, + "loss": 0.0996, + "num_input_tokens_seen": 18741648, + "step": 20720 + }, + { + "epoch": 5.469842945756896, + "grad_norm": 0.0008095014491118491, + "learning_rate": 0.14147517168220458, + "loss": 0.0604, + "num_input_tokens_seen": 18745968, + "step": 20725 + }, + { + "epoch": 5.471162729312393, + "grad_norm": 0.003949776291847229, + "learning_rate": 0.14141636268405616, + "loss": 0.069, + "num_input_tokens_seen": 18750512, + "step": 20730 + }, + { + "epoch": 5.47248251286789, + "grad_norm": 0.0030593557748943567, + "learning_rate": 0.14135755500961253, + "loss": 0.0813, + "num_input_tokens_seen": 18755056, + "step": 20735 + }, + { + "epoch": 5.473802296423386, + "grad_norm": 0.002104880753904581, + "learning_rate": 0.14129874866794245, + "loss": 0.0509, + "num_input_tokens_seen": 18759792, + "step": 20740 + }, + { + "epoch": 5.475122079978884, + "grad_norm": 0.002622429747134447, + "learning_rate": 0.14123994366811476, + "loss": 0.0588, + "num_input_tokens_seen": 18764144, + "step": 20745 + }, + { + "epoch": 5.47644186353438, + "grad_norm": 0.0013499093474820256, + "learning_rate": 0.14118114001919774, + "loss": 0.0272, + "num_input_tokens_seen": 18768752, + "step": 20750 + }, + { + "epoch": 5.477761647089877, + "grad_norm": 0.0012579091126099229, + "learning_rate": 0.14112233773025978, + "loss": 0.0581, + "num_input_tokens_seen": 18773296, + "step": 20755 + }, + { + "epoch": 5.479081430645374, + "grad_norm": 0.0029822923243045807, + "learning_rate": 0.14106353681036896, + "loss": 0.0552, + "num_input_tokens_seen": 18777904, + "step": 20760 + }, + { + "epoch": 5.480401214200871, + "grad_norm": 0.0039007235318422318, + "learning_rate": 0.14100473726859303, + "loss": 0.0668, + "num_input_tokens_seen": 18782352, + "step": 20765 + }, + { + "epoch": 5.481720997756368, + "grad_norm": 0.0014680895255878568, + "learning_rate": 0.14094593911399964, + "loss": 0.0762, + "num_input_tokens_seen": 18786576, + "step": 20770 + }, + { + "epoch": 5.483040781311865, + "grad_norm": 0.0014836450573056936, + "learning_rate": 0.14088714235565625, + "loss": 0.0505, + "num_input_tokens_seen": 18790768, + "step": 20775 + }, + { + "epoch": 5.4843605648673615, + "grad_norm": 0.0013690009946003556, + "learning_rate": 0.14082834700263, + "loss": 0.0567, + "num_input_tokens_seen": 18795184, + "step": 20780 + }, + { + "epoch": 5.485680348422859, + "grad_norm": 0.0007762904861010611, + "learning_rate": 0.14076955306398795, + "loss": 0.0587, + "num_input_tokens_seen": 18799504, + "step": 20785 + }, + { + "epoch": 5.4870001319783555, + "grad_norm": 0.0023150246124714613, + "learning_rate": 0.14071076054879675, + "loss": 0.0789, + "num_input_tokens_seen": 18803952, + "step": 20790 + }, + { + "epoch": 5.488319915533852, + "grad_norm": 0.0032278753351420164, + "learning_rate": 0.14065196946612302, + "loss": 0.0393, + "num_input_tokens_seen": 18808784, + "step": 20795 + }, + { + "epoch": 5.4896396990893495, + "grad_norm": 0.0003297505609225482, + "learning_rate": 0.1405931798250331, + "loss": 0.0593, + "num_input_tokens_seen": 18813264, + "step": 20800 + }, + { + "epoch": 5.4896396990893495, + "eval_loss": 0.08872342854738235, + "eval_runtime": 75.9658, + "eval_samples_per_second": 88.658, + "eval_steps_per_second": 22.168, + "num_input_tokens_seen": 18813264, + "step": 20800 + }, + { + "epoch": 5.490959482644846, + "grad_norm": 0.001862650620751083, + "learning_rate": 0.14053439163459308, + "loss": 0.056, + "num_input_tokens_seen": 18817904, + "step": 20805 + }, + { + "epoch": 5.4922792662003435, + "grad_norm": 0.001935381325893104, + "learning_rate": 0.14047560490386876, + "loss": 0.0626, + "num_input_tokens_seen": 18822480, + "step": 20810 + }, + { + "epoch": 5.49359904975584, + "grad_norm": 0.0012945978669449687, + "learning_rate": 0.14041681964192593, + "loss": 0.0565, + "num_input_tokens_seen": 18827120, + "step": 20815 + }, + { + "epoch": 5.494918833311337, + "grad_norm": 0.0034239247906953096, + "learning_rate": 0.14035803585782988, + "loss": 0.0815, + "num_input_tokens_seen": 18831824, + "step": 20820 + }, + { + "epoch": 5.496238616866834, + "grad_norm": 0.0012892658123746514, + "learning_rate": 0.14029925356064593, + "loss": 0.0422, + "num_input_tokens_seen": 18836496, + "step": 20825 + }, + { + "epoch": 5.497558400422331, + "grad_norm": 0.002494418527930975, + "learning_rate": 0.1402404727594389, + "loss": 0.0856, + "num_input_tokens_seen": 18841168, + "step": 20830 + }, + { + "epoch": 5.498878183977828, + "grad_norm": 0.004273512866348028, + "learning_rate": 0.1401816934632737, + "loss": 0.0647, + "num_input_tokens_seen": 18845328, + "step": 20835 + }, + { + "epoch": 5.500197967533325, + "grad_norm": 0.002872748300433159, + "learning_rate": 0.1401229156812147, + "loss": 0.109, + "num_input_tokens_seen": 18849904, + "step": 20840 + }, + { + "epoch": 5.501517751088821, + "grad_norm": 0.002420891309157014, + "learning_rate": 0.14006413942232626, + "loss": 0.0595, + "num_input_tokens_seen": 18854288, + "step": 20845 + }, + { + "epoch": 5.502837534644319, + "grad_norm": 0.0017218082211911678, + "learning_rate": 0.14000536469567235, + "loss": 0.0737, + "num_input_tokens_seen": 18858736, + "step": 20850 + }, + { + "epoch": 5.504157318199815, + "grad_norm": 0.0014793500304222107, + "learning_rate": 0.13994659151031685, + "loss": 0.0596, + "num_input_tokens_seen": 18863344, + "step": 20855 + }, + { + "epoch": 5.505477101755313, + "grad_norm": 0.0027360129170119762, + "learning_rate": 0.13988781987532323, + "loss": 0.0632, + "num_input_tokens_seen": 18868080, + "step": 20860 + }, + { + "epoch": 5.506796885310809, + "grad_norm": 0.0018278186907991767, + "learning_rate": 0.1398290497997549, + "loss": 0.0811, + "num_input_tokens_seen": 18872400, + "step": 20865 + }, + { + "epoch": 5.508116668866306, + "grad_norm": 0.001604323973879218, + "learning_rate": 0.13977028129267488, + "loss": 0.092, + "num_input_tokens_seen": 18876656, + "step": 20870 + }, + { + "epoch": 5.509436452421803, + "grad_norm": 0.0025049468968063593, + "learning_rate": 0.13971151436314605, + "loss": 0.0881, + "num_input_tokens_seen": 18880880, + "step": 20875 + }, + { + "epoch": 5.5107562359773, + "grad_norm": 0.003125397488474846, + "learning_rate": 0.13965274902023103, + "loss": 0.0643, + "num_input_tokens_seen": 18885296, + "step": 20880 + }, + { + "epoch": 5.512076019532796, + "grad_norm": 0.0005557900876738131, + "learning_rate": 0.13959398527299208, + "loss": 0.0476, + "num_input_tokens_seen": 18889776, + "step": 20885 + }, + { + "epoch": 5.513395803088294, + "grad_norm": 0.0023336666636168957, + "learning_rate": 0.13953522313049138, + "loss": 0.0741, + "num_input_tokens_seen": 18894320, + "step": 20890 + }, + { + "epoch": 5.51471558664379, + "grad_norm": 0.0010580826783552766, + "learning_rate": 0.13947646260179083, + "loss": 0.0704, + "num_input_tokens_seen": 18898992, + "step": 20895 + }, + { + "epoch": 5.516035370199288, + "grad_norm": 0.0018065155018121004, + "learning_rate": 0.13941770369595194, + "loss": 0.0901, + "num_input_tokens_seen": 18903472, + "step": 20900 + }, + { + "epoch": 5.517355153754784, + "grad_norm": 0.001340651884675026, + "learning_rate": 0.1393589464220362, + "loss": 0.0418, + "num_input_tokens_seen": 18908016, + "step": 20905 + }, + { + "epoch": 5.518674937310281, + "grad_norm": 0.0016433141427114606, + "learning_rate": 0.13930019078910455, + "loss": 0.0548, + "num_input_tokens_seen": 18912464, + "step": 20910 + }, + { + "epoch": 5.519994720865778, + "grad_norm": 0.0010273357620462775, + "learning_rate": 0.139241436806218, + "loss": 0.0661, + "num_input_tokens_seen": 18916912, + "step": 20915 + }, + { + "epoch": 5.521314504421275, + "grad_norm": 0.0009362144628539681, + "learning_rate": 0.13918268448243712, + "loss": 0.0285, + "num_input_tokens_seen": 18921552, + "step": 20920 + }, + { + "epoch": 5.522634287976771, + "grad_norm": 0.001636494416743517, + "learning_rate": 0.13912393382682217, + "loss": 0.0632, + "num_input_tokens_seen": 18926192, + "step": 20925 + }, + { + "epoch": 5.523954071532269, + "grad_norm": 0.0019387876382097602, + "learning_rate": 0.1390651848484333, + "loss": 0.0433, + "num_input_tokens_seen": 18930352, + "step": 20930 + }, + { + "epoch": 5.525273855087765, + "grad_norm": 0.0022405805066227913, + "learning_rate": 0.1390064375563304, + "loss": 0.0868, + "num_input_tokens_seen": 18934992, + "step": 20935 + }, + { + "epoch": 5.526593638643263, + "grad_norm": 0.002844392554834485, + "learning_rate": 0.13894769195957293, + "loss": 0.0843, + "num_input_tokens_seen": 18939632, + "step": 20940 + }, + { + "epoch": 5.527913422198759, + "grad_norm": 0.003327704733237624, + "learning_rate": 0.13888894806722032, + "loss": 0.0739, + "num_input_tokens_seen": 18944176, + "step": 20945 + }, + { + "epoch": 5.529233205754256, + "grad_norm": 0.0016897986643016338, + "learning_rate": 0.1388302058883315, + "loss": 0.0575, + "num_input_tokens_seen": 18948752, + "step": 20950 + }, + { + "epoch": 5.530552989309753, + "grad_norm": 0.0009434213861823082, + "learning_rate": 0.13877146543196528, + "loss": 0.0256, + "num_input_tokens_seen": 18953456, + "step": 20955 + }, + { + "epoch": 5.53187277286525, + "grad_norm": 0.0034671304747462273, + "learning_rate": 0.13871272670718027, + "loss": 0.0608, + "num_input_tokens_seen": 18958224, + "step": 20960 + }, + { + "epoch": 5.533192556420747, + "grad_norm": 0.0005786630208604038, + "learning_rate": 0.13865398972303455, + "loss": 0.0411, + "num_input_tokens_seen": 18962704, + "step": 20965 + }, + { + "epoch": 5.534512339976244, + "grad_norm": 0.0033162026666104794, + "learning_rate": 0.13859525448858623, + "loss": 0.0622, + "num_input_tokens_seen": 18967216, + "step": 20970 + }, + { + "epoch": 5.5358321235317405, + "grad_norm": 0.0012472628150135279, + "learning_rate": 0.13853652101289304, + "loss": 0.0386, + "num_input_tokens_seen": 18971696, + "step": 20975 + }, + { + "epoch": 5.537151907087238, + "grad_norm": 0.00543064484372735, + "learning_rate": 0.13847778930501234, + "loss": 0.0914, + "num_input_tokens_seen": 18976336, + "step": 20980 + }, + { + "epoch": 5.5384716906427345, + "grad_norm": 0.002410462824627757, + "learning_rate": 0.1384190593740013, + "loss": 0.04, + "num_input_tokens_seen": 18980752, + "step": 20985 + }, + { + "epoch": 5.539791474198232, + "grad_norm": 0.0008176291012205184, + "learning_rate": 0.13836033122891686, + "loss": 0.0759, + "num_input_tokens_seen": 18985360, + "step": 20990 + }, + { + "epoch": 5.5411112577537285, + "grad_norm": 0.0007733008824288845, + "learning_rate": 0.1383016048788156, + "loss": 0.0318, + "num_input_tokens_seen": 18990160, + "step": 20995 + }, + { + "epoch": 5.542431041309225, + "grad_norm": 0.0018688826821744442, + "learning_rate": 0.13824288033275392, + "loss": 0.0695, + "num_input_tokens_seen": 18994928, + "step": 21000 + }, + { + "epoch": 5.542431041309225, + "eval_loss": 0.09065626561641693, + "eval_runtime": 75.9127, + "eval_samples_per_second": 88.72, + "eval_steps_per_second": 22.183, + "num_input_tokens_seen": 18994928, + "step": 21000 + }, + { + "epoch": 5.5437508248647225, + "grad_norm": 0.00039988712524063885, + "learning_rate": 0.1381841575997878, + "loss": 0.0753, + "num_input_tokens_seen": 18999696, + "step": 21005 + }, + { + "epoch": 5.545070608420219, + "grad_norm": 0.0009217948536388576, + "learning_rate": 0.13812543668897306, + "loss": 0.0394, + "num_input_tokens_seen": 19004176, + "step": 21010 + }, + { + "epoch": 5.546390391975716, + "grad_norm": 0.0031325523741543293, + "learning_rate": 0.13806671760936526, + "loss": 0.0407, + "num_input_tokens_seen": 19008528, + "step": 21015 + }, + { + "epoch": 5.547710175531213, + "grad_norm": 0.004236098378896713, + "learning_rate": 0.13800800037001956, + "loss": 0.0994, + "num_input_tokens_seen": 19013072, + "step": 21020 + }, + { + "epoch": 5.54902995908671, + "grad_norm": 0.0019871348049491644, + "learning_rate": 0.13794928497999087, + "loss": 0.0915, + "num_input_tokens_seen": 19017712, + "step": 21025 + }, + { + "epoch": 5.550349742642207, + "grad_norm": 0.0017501862021163106, + "learning_rate": 0.1378905714483339, + "loss": 0.0719, + "num_input_tokens_seen": 19022416, + "step": 21030 + }, + { + "epoch": 5.551669526197704, + "grad_norm": 0.0019783549942076206, + "learning_rate": 0.13783185978410295, + "loss": 0.0833, + "num_input_tokens_seen": 19026800, + "step": 21035 + }, + { + "epoch": 5.5529893097532, + "grad_norm": 0.0020301423501223326, + "learning_rate": 0.13777314999635218, + "loss": 0.0885, + "num_input_tokens_seen": 19031440, + "step": 21040 + }, + { + "epoch": 5.554309093308698, + "grad_norm": 0.0018002905417233706, + "learning_rate": 0.1377144420941353, + "loss": 0.0466, + "num_input_tokens_seen": 19035888, + "step": 21045 + }, + { + "epoch": 5.555628876864194, + "grad_norm": 0.002854804042726755, + "learning_rate": 0.13765573608650586, + "loss": 0.0692, + "num_input_tokens_seen": 19040240, + "step": 21050 + }, + { + "epoch": 5.556948660419691, + "grad_norm": 0.0023517333902418613, + "learning_rate": 0.13759703198251702, + "loss": 0.0715, + "num_input_tokens_seen": 19044496, + "step": 21055 + }, + { + "epoch": 5.558268443975188, + "grad_norm": 0.003601768286898732, + "learning_rate": 0.13753832979122174, + "loss": 0.0825, + "num_input_tokens_seen": 19048848, + "step": 21060 + }, + { + "epoch": 5.559588227530685, + "grad_norm": 0.0020101764239370823, + "learning_rate": 0.13747962952167264, + "loss": 0.086, + "num_input_tokens_seen": 19053520, + "step": 21065 + }, + { + "epoch": 5.560908011086182, + "grad_norm": 0.0015593287535011768, + "learning_rate": 0.13742093118292192, + "loss": 0.0663, + "num_input_tokens_seen": 19058064, + "step": 21070 + }, + { + "epoch": 5.562227794641679, + "grad_norm": 0.001522072940133512, + "learning_rate": 0.13736223478402174, + "loss": 0.0559, + "num_input_tokens_seen": 19062832, + "step": 21075 + }, + { + "epoch": 5.563547578197175, + "grad_norm": 0.0021333477925509214, + "learning_rate": 0.1373035403340238, + "loss": 0.0651, + "num_input_tokens_seen": 19067472, + "step": 21080 + }, + { + "epoch": 5.564867361752673, + "grad_norm": 0.0032546164002269506, + "learning_rate": 0.13724484784197943, + "loss": 0.1152, + "num_input_tokens_seen": 19071696, + "step": 21085 + }, + { + "epoch": 5.566187145308169, + "grad_norm": 0.0012881509028375149, + "learning_rate": 0.13718615731693987, + "loss": 0.0925, + "num_input_tokens_seen": 19076080, + "step": 21090 + }, + { + "epoch": 5.567506928863667, + "grad_norm": 0.0013989066937938333, + "learning_rate": 0.13712746876795587, + "loss": 0.046, + "num_input_tokens_seen": 19080368, + "step": 21095 + }, + { + "epoch": 5.568826712419163, + "grad_norm": 0.001406902214512229, + "learning_rate": 0.13706878220407792, + "loss": 0.0702, + "num_input_tokens_seen": 19084816, + "step": 21100 + }, + { + "epoch": 5.57014649597466, + "grad_norm": 0.0013584858970716596, + "learning_rate": 0.13701009763435631, + "loss": 0.0395, + "num_input_tokens_seen": 19089328, + "step": 21105 + }, + { + "epoch": 5.571466279530157, + "grad_norm": 0.002203582553192973, + "learning_rate": 0.13695141506784084, + "loss": 0.0575, + "num_input_tokens_seen": 19093712, + "step": 21110 + }, + { + "epoch": 5.572786063085654, + "grad_norm": 0.004143914673477411, + "learning_rate": 0.13689273451358114, + "loss": 0.0814, + "num_input_tokens_seen": 19098352, + "step": 21115 + }, + { + "epoch": 5.574105846641151, + "grad_norm": 0.003476201556622982, + "learning_rate": 0.13683405598062653, + "loss": 0.0982, + "num_input_tokens_seen": 19102832, + "step": 21120 + }, + { + "epoch": 5.575425630196648, + "grad_norm": 0.002219526330009103, + "learning_rate": 0.1367753794780259, + "loss": 0.0504, + "num_input_tokens_seen": 19107120, + "step": 21125 + }, + { + "epoch": 5.576745413752144, + "grad_norm": 0.0015976501163095236, + "learning_rate": 0.13671670501482802, + "loss": 0.0898, + "num_input_tokens_seen": 19111984, + "step": 21130 + }, + { + "epoch": 5.578065197307642, + "grad_norm": 0.0024185602087527514, + "learning_rate": 0.1366580326000811, + "loss": 0.07, + "num_input_tokens_seen": 19116464, + "step": 21135 + }, + { + "epoch": 5.579384980863138, + "grad_norm": 0.0014076727675274014, + "learning_rate": 0.1365993622428332, + "loss": 0.0526, + "num_input_tokens_seen": 19120848, + "step": 21140 + }, + { + "epoch": 5.580704764418636, + "grad_norm": 0.0025318944826722145, + "learning_rate": 0.13654069395213211, + "loss": 0.053, + "num_input_tokens_seen": 19125296, + "step": 21145 + }, + { + "epoch": 5.582024547974132, + "grad_norm": 0.002066245535388589, + "learning_rate": 0.13648202773702509, + "loss": 0.0473, + "num_input_tokens_seen": 19129936, + "step": 21150 + }, + { + "epoch": 5.583344331529629, + "grad_norm": 0.0022949925623834133, + "learning_rate": 0.13642336360655927, + "loss": 0.1074, + "num_input_tokens_seen": 19134512, + "step": 21155 + }, + { + "epoch": 5.584664115085126, + "grad_norm": 0.0005475904908962548, + "learning_rate": 0.13636470156978145, + "loss": 0.0619, + "num_input_tokens_seen": 19139024, + "step": 21160 + }, + { + "epoch": 5.585983898640623, + "grad_norm": 0.002058550715446472, + "learning_rate": 0.13630604163573798, + "loss": 0.0485, + "num_input_tokens_seen": 19143376, + "step": 21165 + }, + { + "epoch": 5.5873036821961195, + "grad_norm": 0.0013284425949677825, + "learning_rate": 0.13624738381347495, + "loss": 0.0436, + "num_input_tokens_seen": 19147824, + "step": 21170 + }, + { + "epoch": 5.588623465751617, + "grad_norm": 0.0023227103520184755, + "learning_rate": 0.1361887281120382, + "loss": 0.0823, + "num_input_tokens_seen": 19152176, + "step": 21175 + }, + { + "epoch": 5.5899432493071135, + "grad_norm": 0.0012728304136544466, + "learning_rate": 0.13613007454047307, + "loss": 0.0698, + "num_input_tokens_seen": 19156880, + "step": 21180 + }, + { + "epoch": 5.59126303286261, + "grad_norm": 0.0025000423192977905, + "learning_rate": 0.13607142310782486, + "loss": 0.0626, + "num_input_tokens_seen": 19161232, + "step": 21185 + }, + { + "epoch": 5.5925828164181075, + "grad_norm": 0.0012536065187305212, + "learning_rate": 0.13601277382313814, + "loss": 0.0326, + "num_input_tokens_seen": 19165744, + "step": 21190 + }, + { + "epoch": 5.593902599973604, + "grad_norm": 0.0011708791134878993, + "learning_rate": 0.1359541266954575, + "loss": 0.0435, + "num_input_tokens_seen": 19170608, + "step": 21195 + }, + { + "epoch": 5.5952223835291015, + "grad_norm": 0.0034195533953607082, + "learning_rate": 0.13589548173382707, + "loss": 0.0715, + "num_input_tokens_seen": 19174960, + "step": 21200 + }, + { + "epoch": 5.5952223835291015, + "eval_loss": 0.0906238853931427, + "eval_runtime": 75.9204, + "eval_samples_per_second": 88.711, + "eval_steps_per_second": 22.181, + "num_input_tokens_seen": 19174960, + "step": 21200 + }, + { + "epoch": 5.596542167084598, + "grad_norm": 0.001959304790943861, + "learning_rate": 0.1358368389472906, + "loss": 0.0397, + "num_input_tokens_seen": 19179536, + "step": 21205 + }, + { + "epoch": 5.597861950640095, + "grad_norm": 0.0034658045042306185, + "learning_rate": 0.13577819834489155, + "loss": 0.0782, + "num_input_tokens_seen": 19184144, + "step": 21210 + }, + { + "epoch": 5.599181734195592, + "grad_norm": 0.0019318020204082131, + "learning_rate": 0.135719559935673, + "loss": 0.0699, + "num_input_tokens_seen": 19188624, + "step": 21215 + }, + { + "epoch": 5.600501517751089, + "grad_norm": 0.001897369627840817, + "learning_rate": 0.13566092372867775, + "loss": 0.0517, + "num_input_tokens_seen": 19193040, + "step": 21220 + }, + { + "epoch": 5.601821301306586, + "grad_norm": 0.0015533891273662448, + "learning_rate": 0.13560228973294833, + "loss": 0.0413, + "num_input_tokens_seen": 19198000, + "step": 21225 + }, + { + "epoch": 5.603141084862083, + "grad_norm": 0.001577372313477099, + "learning_rate": 0.13554365795752668, + "loss": 0.0828, + "num_input_tokens_seen": 19202672, + "step": 21230 + }, + { + "epoch": 5.604460868417579, + "grad_norm": 0.001716545899398625, + "learning_rate": 0.1354850284114547, + "loss": 0.0788, + "num_input_tokens_seen": 19207120, + "step": 21235 + }, + { + "epoch": 5.605780651973077, + "grad_norm": 0.003284305101260543, + "learning_rate": 0.13542640110377374, + "loss": 0.0439, + "num_input_tokens_seen": 19211600, + "step": 21240 + }, + { + "epoch": 5.607100435528573, + "grad_norm": 0.004532422870397568, + "learning_rate": 0.13536777604352487, + "loss": 0.0485, + "num_input_tokens_seen": 19216272, + "step": 21245 + }, + { + "epoch": 5.608420219084071, + "grad_norm": 0.0021123592741787434, + "learning_rate": 0.13530915323974887, + "loss": 0.0563, + "num_input_tokens_seen": 19221072, + "step": 21250 + }, + { + "epoch": 5.609740002639567, + "grad_norm": 0.0029583233408629894, + "learning_rate": 0.13525053270148596, + "loss": 0.0482, + "num_input_tokens_seen": 19225584, + "step": 21255 + }, + { + "epoch": 5.611059786195064, + "grad_norm": 0.0037554623559117317, + "learning_rate": 0.13519191443777628, + "loss": 0.0673, + "num_input_tokens_seen": 19230096, + "step": 21260 + }, + { + "epoch": 5.612379569750561, + "grad_norm": 0.0004462717624846846, + "learning_rate": 0.13513329845765953, + "loss": 0.0381, + "num_input_tokens_seen": 19234608, + "step": 21265 + }, + { + "epoch": 5.613699353306058, + "grad_norm": 0.0027709458954632282, + "learning_rate": 0.13507468477017495, + "loss": 0.0865, + "num_input_tokens_seen": 19238864, + "step": 21270 + }, + { + "epoch": 5.615019136861555, + "grad_norm": 0.002516203559935093, + "learning_rate": 0.13501607338436153, + "loss": 0.0739, + "num_input_tokens_seen": 19243344, + "step": 21275 + }, + { + "epoch": 5.616338920417052, + "grad_norm": 0.0011984226293861866, + "learning_rate": 0.13495746430925798, + "loss": 0.0476, + "num_input_tokens_seen": 19248176, + "step": 21280 + }, + { + "epoch": 5.617658703972548, + "grad_norm": 0.001728956587612629, + "learning_rate": 0.13489885755390238, + "loss": 0.0669, + "num_input_tokens_seen": 19252816, + "step": 21285 + }, + { + "epoch": 5.618978487528046, + "grad_norm": 0.0007059220806695521, + "learning_rate": 0.13484025312733275, + "loss": 0.0663, + "num_input_tokens_seen": 19257648, + "step": 21290 + }, + { + "epoch": 5.620298271083542, + "grad_norm": 0.0030708604026585817, + "learning_rate": 0.13478165103858658, + "loss": 0.062, + "num_input_tokens_seen": 19262288, + "step": 21295 + }, + { + "epoch": 5.621618054639039, + "grad_norm": 0.002792841289192438, + "learning_rate": 0.13472305129670106, + "loss": 0.0645, + "num_input_tokens_seen": 19266928, + "step": 21300 + }, + { + "epoch": 5.622937838194536, + "grad_norm": 0.002629695925861597, + "learning_rate": 0.13466445391071305, + "loss": 0.0327, + "num_input_tokens_seen": 19271152, + "step": 21305 + }, + { + "epoch": 5.624257621750033, + "grad_norm": 0.0016185062704607844, + "learning_rate": 0.13460585888965895, + "loss": 0.0826, + "num_input_tokens_seen": 19275760, + "step": 21310 + }, + { + "epoch": 5.625577405305529, + "grad_norm": 0.0023451668675988913, + "learning_rate": 0.13454726624257482, + "loss": 0.0597, + "num_input_tokens_seen": 19280432, + "step": 21315 + }, + { + "epoch": 5.626897188861027, + "grad_norm": 0.000904591113794595, + "learning_rate": 0.1344886759784965, + "loss": 0.0561, + "num_input_tokens_seen": 19284720, + "step": 21320 + }, + { + "epoch": 5.628216972416523, + "grad_norm": 0.0023082129191607237, + "learning_rate": 0.13443008810645923, + "loss": 0.0553, + "num_input_tokens_seen": 19289264, + "step": 21325 + }, + { + "epoch": 5.629536755972021, + "grad_norm": 0.0028799602296203375, + "learning_rate": 0.13437150263549807, + "loss": 0.0525, + "num_input_tokens_seen": 19293456, + "step": 21330 + }, + { + "epoch": 5.630856539527517, + "grad_norm": 0.0025252813939005136, + "learning_rate": 0.13431291957464755, + "loss": 0.0686, + "num_input_tokens_seen": 19297904, + "step": 21335 + }, + { + "epoch": 5.632176323083014, + "grad_norm": 0.0013957035262137651, + "learning_rate": 0.13425433893294197, + "loss": 0.0349, + "num_input_tokens_seen": 19302480, + "step": 21340 + }, + { + "epoch": 5.633496106638511, + "grad_norm": 0.003264862112700939, + "learning_rate": 0.13419576071941525, + "loss": 0.1044, + "num_input_tokens_seen": 19306832, + "step": 21345 + }, + { + "epoch": 5.634815890194008, + "grad_norm": 0.000876259058713913, + "learning_rate": 0.1341371849431008, + "loss": 0.0523, + "num_input_tokens_seen": 19311088, + "step": 21350 + }, + { + "epoch": 5.636135673749505, + "grad_norm": 0.002203865209594369, + "learning_rate": 0.13407861161303178, + "loss": 0.1021, + "num_input_tokens_seen": 19315632, + "step": 21355 + }, + { + "epoch": 5.637455457305002, + "grad_norm": 0.0023062601685523987, + "learning_rate": 0.13402004073824098, + "loss": 0.0738, + "num_input_tokens_seen": 19320144, + "step": 21360 + }, + { + "epoch": 5.6387752408604985, + "grad_norm": 0.0021598476450890303, + "learning_rate": 0.13396147232776062, + "loss": 0.0737, + "num_input_tokens_seen": 19324656, + "step": 21365 + }, + { + "epoch": 5.640095024415996, + "grad_norm": 0.0004246800090186298, + "learning_rate": 0.13390290639062288, + "loss": 0.0381, + "num_input_tokens_seen": 19329392, + "step": 21370 + }, + { + "epoch": 5.6414148079714925, + "grad_norm": 0.00401339353993535, + "learning_rate": 0.13384434293585917, + "loss": 0.086, + "num_input_tokens_seen": 19334288, + "step": 21375 + }, + { + "epoch": 5.64273459152699, + "grad_norm": 0.001157064689323306, + "learning_rate": 0.13378578197250088, + "loss": 0.0798, + "num_input_tokens_seen": 19338768, + "step": 21380 + }, + { + "epoch": 5.6440543750824865, + "grad_norm": 0.002148005645722151, + "learning_rate": 0.13372722350957872, + "loss": 0.0487, + "num_input_tokens_seen": 19343504, + "step": 21385 + }, + { + "epoch": 5.645374158637983, + "grad_norm": 0.0027785582933574915, + "learning_rate": 0.13366866755612322, + "loss": 0.071, + "num_input_tokens_seen": 19348176, + "step": 21390 + }, + { + "epoch": 5.6466939421934805, + "grad_norm": 0.0012671466683968902, + "learning_rate": 0.13361011412116436, + "loss": 0.0678, + "num_input_tokens_seen": 19352624, + "step": 21395 + }, + { + "epoch": 5.648013725748977, + "grad_norm": 0.001045513665303588, + "learning_rate": 0.13355156321373196, + "loss": 0.0352, + "num_input_tokens_seen": 19356976, + "step": 21400 + }, + { + "epoch": 5.648013725748977, + "eval_loss": 0.08937028050422668, + "eval_runtime": 75.9655, + "eval_samples_per_second": 88.659, + "eval_steps_per_second": 22.168, + "num_input_tokens_seen": 19356976, + "step": 21400 + }, + { + "epoch": 5.6493335093044745, + "grad_norm": 0.0025660668034106493, + "learning_rate": 0.13349301484285514, + "loss": 0.0803, + "num_input_tokens_seen": 19361488, + "step": 21405 + }, + { + "epoch": 5.650653292859971, + "grad_norm": 0.0033024896401911974, + "learning_rate": 0.13343446901756295, + "loss": 0.0512, + "num_input_tokens_seen": 19366096, + "step": 21410 + }, + { + "epoch": 5.651973076415468, + "grad_norm": 0.002898337785154581, + "learning_rate": 0.13337592574688376, + "loss": 0.0482, + "num_input_tokens_seen": 19370640, + "step": 21415 + }, + { + "epoch": 5.653292859970965, + "grad_norm": 0.002800572896376252, + "learning_rate": 0.13331738503984572, + "loss": 0.0314, + "num_input_tokens_seen": 19375248, + "step": 21420 + }, + { + "epoch": 5.654612643526462, + "grad_norm": 0.0006191066931933165, + "learning_rate": 0.1332588469054766, + "loss": 0.0441, + "num_input_tokens_seen": 19379504, + "step": 21425 + }, + { + "epoch": 5.655932427081958, + "grad_norm": 0.002770589431747794, + "learning_rate": 0.1332003113528036, + "loss": 0.051, + "num_input_tokens_seen": 19383920, + "step": 21430 + }, + { + "epoch": 5.657252210637456, + "grad_norm": 0.0012782220728695393, + "learning_rate": 0.13314177839085373, + "loss": 0.0598, + "num_input_tokens_seen": 19388624, + "step": 21435 + }, + { + "epoch": 5.658571994192952, + "grad_norm": 0.0013190227327868342, + "learning_rate": 0.13308324802865354, + "loss": 0.046, + "num_input_tokens_seen": 19393392, + "step": 21440 + }, + { + "epoch": 5.659891777748449, + "grad_norm": 0.002885436872020364, + "learning_rate": 0.13302472027522905, + "loss": 0.0789, + "num_input_tokens_seen": 19397872, + "step": 21445 + }, + { + "epoch": 5.661211561303946, + "grad_norm": 0.0035626839380711317, + "learning_rate": 0.13296619513960606, + "loss": 0.0914, + "num_input_tokens_seen": 19402288, + "step": 21450 + }, + { + "epoch": 5.662531344859443, + "grad_norm": 0.0027782630641013384, + "learning_rate": 0.1329076726308098, + "loss": 0.0845, + "num_input_tokens_seen": 19406704, + "step": 21455 + }, + { + "epoch": 5.66385112841494, + "grad_norm": 0.003254864364862442, + "learning_rate": 0.13284915275786519, + "loss": 0.0675, + "num_input_tokens_seen": 19411376, + "step": 21460 + }, + { + "epoch": 5.665170911970437, + "grad_norm": 0.0021324530243873596, + "learning_rate": 0.1327906355297968, + "loss": 0.0509, + "num_input_tokens_seen": 19416208, + "step": 21465 + }, + { + "epoch": 5.666490695525933, + "grad_norm": 0.0008746710955165327, + "learning_rate": 0.13273212095562867, + "loss": 0.052, + "num_input_tokens_seen": 19420656, + "step": 21470 + }, + { + "epoch": 5.667810479081431, + "grad_norm": 0.0027492528315633535, + "learning_rate": 0.13267360904438444, + "loss": 0.0562, + "num_input_tokens_seen": 19425200, + "step": 21475 + }, + { + "epoch": 5.669130262636927, + "grad_norm": 0.0029903927352279425, + "learning_rate": 0.1326150998050875, + "loss": 0.1015, + "num_input_tokens_seen": 19429552, + "step": 21480 + }, + { + "epoch": 5.670450046192425, + "grad_norm": 0.0027732686139643192, + "learning_rate": 0.1325565932467606, + "loss": 0.0576, + "num_input_tokens_seen": 19434032, + "step": 21485 + }, + { + "epoch": 5.671769829747921, + "grad_norm": 0.0006454483373090625, + "learning_rate": 0.13249808937842628, + "loss": 0.0442, + "num_input_tokens_seen": 19439024, + "step": 21490 + }, + { + "epoch": 5.673089613303418, + "grad_norm": 0.0015525619965046644, + "learning_rate": 0.1324395882091065, + "loss": 0.0212, + "num_input_tokens_seen": 19443952, + "step": 21495 + }, + { + "epoch": 5.674409396858915, + "grad_norm": 0.0033863596618175507, + "learning_rate": 0.13238108974782284, + "loss": 0.0804, + "num_input_tokens_seen": 19448752, + "step": 21500 + }, + { + "epoch": 5.675729180414412, + "grad_norm": 0.0004690844507422298, + "learning_rate": 0.13232259400359664, + "loss": 0.0541, + "num_input_tokens_seen": 19453200, + "step": 21505 + }, + { + "epoch": 5.677048963969909, + "grad_norm": 0.0005392250022850931, + "learning_rate": 0.13226410098544852, + "loss": 0.0672, + "num_input_tokens_seen": 19458160, + "step": 21510 + }, + { + "epoch": 5.678368747525406, + "grad_norm": 0.0016121871303766966, + "learning_rate": 0.13220561070239892, + "loss": 0.0712, + "num_input_tokens_seen": 19462576, + "step": 21515 + }, + { + "epoch": 5.679688531080902, + "grad_norm": 0.000684103521052748, + "learning_rate": 0.13214712316346783, + "loss": 0.0499, + "num_input_tokens_seen": 19467152, + "step": 21520 + }, + { + "epoch": 5.6810083146364, + "grad_norm": 0.002768878126516938, + "learning_rate": 0.13208863837767465, + "loss": 0.0711, + "num_input_tokens_seen": 19471824, + "step": 21525 + }, + { + "epoch": 5.682328098191896, + "grad_norm": 0.001452572993002832, + "learning_rate": 0.13203015635403856, + "loss": 0.0479, + "num_input_tokens_seen": 19476304, + "step": 21530 + }, + { + "epoch": 5.683647881747394, + "grad_norm": 0.0020926501601934433, + "learning_rate": 0.13197167710157817, + "loss": 0.0469, + "num_input_tokens_seen": 19480400, + "step": 21535 + }, + { + "epoch": 5.68496766530289, + "grad_norm": 0.004225618671625853, + "learning_rate": 0.13191320062931167, + "loss": 0.085, + "num_input_tokens_seen": 19484816, + "step": 21540 + }, + { + "epoch": 5.686287448858387, + "grad_norm": 0.0024945740588009357, + "learning_rate": 0.13185472694625702, + "loss": 0.0626, + "num_input_tokens_seen": 19489520, + "step": 21545 + }, + { + "epoch": 5.687607232413884, + "grad_norm": 0.0017215522238984704, + "learning_rate": 0.13179625606143142, + "loss": 0.0686, + "num_input_tokens_seen": 19493744, + "step": 21550 + }, + { + "epoch": 5.688927015969381, + "grad_norm": 0.000681965728290379, + "learning_rate": 0.13173778798385188, + "loss": 0.072, + "num_input_tokens_seen": 19498128, + "step": 21555 + }, + { + "epoch": 5.6902467995248776, + "grad_norm": 0.0026010777801275253, + "learning_rate": 0.13167932272253505, + "loss": 0.0686, + "num_input_tokens_seen": 19502896, + "step": 21560 + }, + { + "epoch": 5.691566583080375, + "grad_norm": 0.0023785955272614956, + "learning_rate": 0.1316208602864968, + "loss": 0.0438, + "num_input_tokens_seen": 19507152, + "step": 21565 + }, + { + "epoch": 5.6928863666358716, + "grad_norm": 0.0009683115058578551, + "learning_rate": 0.13156240068475292, + "loss": 0.0748, + "num_input_tokens_seen": 19511760, + "step": 21570 + }, + { + "epoch": 5.694206150191369, + "grad_norm": 0.0002918793761637062, + "learning_rate": 0.1315039439263185, + "loss": 0.0502, + "num_input_tokens_seen": 19516304, + "step": 21575 + }, + { + "epoch": 5.6955259337468656, + "grad_norm": 0.0036461292766034603, + "learning_rate": 0.13144549002020833, + "loss": 0.0871, + "num_input_tokens_seen": 19520688, + "step": 21580 + }, + { + "epoch": 5.696845717302362, + "grad_norm": 0.001140200300142169, + "learning_rate": 0.13138703897543688, + "loss": 0.0613, + "num_input_tokens_seen": 19525232, + "step": 21585 + }, + { + "epoch": 5.6981655008578596, + "grad_norm": 0.0009983070194721222, + "learning_rate": 0.1313285908010178, + "loss": 0.0587, + "num_input_tokens_seen": 19529552, + "step": 21590 + }, + { + "epoch": 5.699485284413356, + "grad_norm": 0.001565083279274404, + "learning_rate": 0.13127014550596475, + "loss": 0.0537, + "num_input_tokens_seen": 19534128, + "step": 21595 + }, + { + "epoch": 5.700805067968853, + "grad_norm": 0.0022758713457733393, + "learning_rate": 0.1312117030992906, + "loss": 0.0478, + "num_input_tokens_seen": 19538672, + "step": 21600 + }, + { + "epoch": 5.700805067968853, + "eval_loss": 0.08974321186542511, + "eval_runtime": 75.9638, + "eval_samples_per_second": 88.661, + "eval_steps_per_second": 22.168, + "num_input_tokens_seen": 19538672, + "step": 21600 + }, + { + "epoch": 5.70212485152435, + "grad_norm": 0.0008135944954119623, + "learning_rate": 0.13115326359000795, + "loss": 0.0592, + "num_input_tokens_seen": 19542864, + "step": 21605 + }, + { + "epoch": 5.703444635079847, + "grad_norm": 0.0023204160388559103, + "learning_rate": 0.13109482698712896, + "loss": 0.0452, + "num_input_tokens_seen": 19547536, + "step": 21610 + }, + { + "epoch": 5.704764418635344, + "grad_norm": 0.001397436368279159, + "learning_rate": 0.1310363932996651, + "loss": 0.0312, + "num_input_tokens_seen": 19551984, + "step": 21615 + }, + { + "epoch": 5.706084202190841, + "grad_norm": 0.002644294872879982, + "learning_rate": 0.13097796253662775, + "loss": 0.0549, + "num_input_tokens_seen": 19556112, + "step": 21620 + }, + { + "epoch": 5.707403985746337, + "grad_norm": 0.0011370670981705189, + "learning_rate": 0.1309195347070277, + "loss": 0.066, + "num_input_tokens_seen": 19560720, + "step": 21625 + }, + { + "epoch": 5.708723769301835, + "grad_norm": 0.0021080041769891977, + "learning_rate": 0.13086110981987506, + "loss": 0.1121, + "num_input_tokens_seen": 19565040, + "step": 21630 + }, + { + "epoch": 5.710043552857331, + "grad_norm": 0.0019075393211096525, + "learning_rate": 0.13080268788417987, + "loss": 0.056, + "num_input_tokens_seen": 19569584, + "step": 21635 + }, + { + "epoch": 5.711363336412829, + "grad_norm": 0.0013830342795699835, + "learning_rate": 0.1307442689089515, + "loss": 0.0728, + "num_input_tokens_seen": 19573936, + "step": 21640 + }, + { + "epoch": 5.712683119968325, + "grad_norm": 0.0019702569115906954, + "learning_rate": 0.13068585290319873, + "loss": 0.1111, + "num_input_tokens_seen": 19578864, + "step": 21645 + }, + { + "epoch": 5.714002903523822, + "grad_norm": 0.002046329667791724, + "learning_rate": 0.13062743987593026, + "loss": 0.0415, + "num_input_tokens_seen": 19583312, + "step": 21650 + }, + { + "epoch": 5.715322687079319, + "grad_norm": 0.0005128412740305066, + "learning_rate": 0.13056902983615395, + "loss": 0.0413, + "num_input_tokens_seen": 19587696, + "step": 21655 + }, + { + "epoch": 5.716642470634816, + "grad_norm": 0.0006303744157776237, + "learning_rate": 0.13051062279287742, + "loss": 0.0441, + "num_input_tokens_seen": 19592528, + "step": 21660 + }, + { + "epoch": 5.717962254190313, + "grad_norm": 0.0030376664362847805, + "learning_rate": 0.13045221875510782, + "loss": 0.0908, + "num_input_tokens_seen": 19596816, + "step": 21665 + }, + { + "epoch": 5.71928203774581, + "grad_norm": 0.003068802412599325, + "learning_rate": 0.13039381773185174, + "loss": 0.0783, + "num_input_tokens_seen": 19601360, + "step": 21670 + }, + { + "epoch": 5.720601821301306, + "grad_norm": 0.001640409231185913, + "learning_rate": 0.1303354197321153, + "loss": 0.0805, + "num_input_tokens_seen": 19606000, + "step": 21675 + }, + { + "epoch": 5.721921604856804, + "grad_norm": 0.0012255619512870908, + "learning_rate": 0.13027702476490433, + "loss": 0.0601, + "num_input_tokens_seen": 19610512, + "step": 21680 + }, + { + "epoch": 5.7232413884123, + "grad_norm": 0.0013627026928588748, + "learning_rate": 0.1302186328392239, + "loss": 0.0506, + "num_input_tokens_seen": 19614928, + "step": 21685 + }, + { + "epoch": 5.724561171967797, + "grad_norm": 0.001716910395771265, + "learning_rate": 0.130160243964079, + "loss": 0.0593, + "num_input_tokens_seen": 19619376, + "step": 21690 + }, + { + "epoch": 5.725880955523294, + "grad_norm": 0.0006060894811525941, + "learning_rate": 0.13010185814847372, + "loss": 0.1161, + "num_input_tokens_seen": 19624112, + "step": 21695 + }, + { + "epoch": 5.727200739078791, + "grad_norm": 0.0011280555045232177, + "learning_rate": 0.13004347540141192, + "loss": 0.0477, + "num_input_tokens_seen": 19628656, + "step": 21700 + }, + { + "epoch": 5.728520522634288, + "grad_norm": 0.0010434855939820409, + "learning_rate": 0.12998509573189712, + "loss": 0.0318, + "num_input_tokens_seen": 19633264, + "step": 21705 + }, + { + "epoch": 5.729840306189785, + "grad_norm": 0.0034496665466576815, + "learning_rate": 0.12992671914893203, + "loss": 0.096, + "num_input_tokens_seen": 19637904, + "step": 21710 + }, + { + "epoch": 5.7311600897452815, + "grad_norm": 0.0007661330746486783, + "learning_rate": 0.12986834566151909, + "loss": 0.0441, + "num_input_tokens_seen": 19642512, + "step": 21715 + }, + { + "epoch": 5.732479873300779, + "grad_norm": 0.0006855024257674813, + "learning_rate": 0.12980997527866028, + "loss": 0.0455, + "num_input_tokens_seen": 19647440, + "step": 21720 + }, + { + "epoch": 5.7337996568562755, + "grad_norm": 0.0031523853540420532, + "learning_rate": 0.12975160800935692, + "loss": 0.0949, + "num_input_tokens_seen": 19651568, + "step": 21725 + }, + { + "epoch": 5.735119440411772, + "grad_norm": 0.0020652152597904205, + "learning_rate": 0.12969324386261016, + "loss": 0.0495, + "num_input_tokens_seen": 19656304, + "step": 21730 + }, + { + "epoch": 5.7364392239672695, + "grad_norm": 0.0018442398868501186, + "learning_rate": 0.12963488284742034, + "loss": 0.0874, + "num_input_tokens_seen": 19660624, + "step": 21735 + }, + { + "epoch": 5.737759007522766, + "grad_norm": 0.0021464999299496412, + "learning_rate": 0.12957652497278752, + "loss": 0.0432, + "num_input_tokens_seen": 19665232, + "step": 21740 + }, + { + "epoch": 5.7390787910782635, + "grad_norm": 0.001972343074157834, + "learning_rate": 0.12951817024771117, + "loss": 0.0495, + "num_input_tokens_seen": 19669584, + "step": 21745 + }, + { + "epoch": 5.74039857463376, + "grad_norm": 0.0022900961339473724, + "learning_rate": 0.12945981868119041, + "loss": 0.0591, + "num_input_tokens_seen": 19674224, + "step": 21750 + }, + { + "epoch": 5.741718358189257, + "grad_norm": 0.003408652963116765, + "learning_rate": 0.12940147028222376, + "loss": 0.044, + "num_input_tokens_seen": 19678576, + "step": 21755 + }, + { + "epoch": 5.743038141744754, + "grad_norm": 0.0023051362950354815, + "learning_rate": 0.12934312505980916, + "loss": 0.0767, + "num_input_tokens_seen": 19683248, + "step": 21760 + }, + { + "epoch": 5.744357925300251, + "grad_norm": 0.004506882280111313, + "learning_rate": 0.1292847830229443, + "loss": 0.0985, + "num_input_tokens_seen": 19687856, + "step": 21765 + }, + { + "epoch": 5.745677708855748, + "grad_norm": 0.00195927987806499, + "learning_rate": 0.12922644418062626, + "loss": 0.0656, + "num_input_tokens_seen": 19692400, + "step": 21770 + }, + { + "epoch": 5.746997492411245, + "grad_norm": 0.0009957069996744394, + "learning_rate": 0.1291681085418515, + "loss": 0.0527, + "num_input_tokens_seen": 19696688, + "step": 21775 + }, + { + "epoch": 5.748317275966741, + "grad_norm": 0.002177776303142309, + "learning_rate": 0.12910977611561628, + "loss": 0.0435, + "num_input_tokens_seen": 19701296, + "step": 21780 + }, + { + "epoch": 5.749637059522239, + "grad_norm": 0.0019551741424947977, + "learning_rate": 0.1290514469109161, + "loss": 0.0768, + "num_input_tokens_seen": 19706160, + "step": 21785 + }, + { + "epoch": 5.750956843077735, + "grad_norm": 0.0022318370174616575, + "learning_rate": 0.128993120936746, + "loss": 0.0713, + "num_input_tokens_seen": 19710672, + "step": 21790 + }, + { + "epoch": 5.752276626633233, + "grad_norm": 0.002479752991348505, + "learning_rate": 0.12893479820210071, + "loss": 0.1041, + "num_input_tokens_seen": 19715184, + "step": 21795 + }, + { + "epoch": 5.753596410188729, + "grad_norm": 0.001686894684098661, + "learning_rate": 0.1288764787159742, + "loss": 0.0496, + "num_input_tokens_seen": 19719600, + "step": 21800 + }, + { + "epoch": 5.753596410188729, + "eval_loss": 0.08456743508577347, + "eval_runtime": 75.9179, + "eval_samples_per_second": 88.714, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 19719600, + "step": 21800 + }, + { + "epoch": 5.754916193744226, + "grad_norm": 0.0005980346468277276, + "learning_rate": 0.1288181624873601, + "loss": 0.0535, + "num_input_tokens_seen": 19724048, + "step": 21805 + }, + { + "epoch": 5.756235977299723, + "grad_norm": 0.0010806540958583355, + "learning_rate": 0.12875984952525163, + "loss": 0.054, + "num_input_tokens_seen": 19728720, + "step": 21810 + }, + { + "epoch": 5.75755576085522, + "grad_norm": 0.0009841738501563668, + "learning_rate": 0.12870153983864122, + "loss": 0.1003, + "num_input_tokens_seen": 19733232, + "step": 21815 + }, + { + "epoch": 5.758875544410717, + "grad_norm": 0.001356832217425108, + "learning_rate": 0.12864323343652104, + "loss": 0.0604, + "num_input_tokens_seen": 19737840, + "step": 21820 + }, + { + "epoch": 5.760195327966214, + "grad_norm": 0.0026105944998562336, + "learning_rate": 0.12858493032788268, + "loss": 0.0993, + "num_input_tokens_seen": 19742384, + "step": 21825 + }, + { + "epoch": 5.76151511152171, + "grad_norm": 0.002070779213681817, + "learning_rate": 0.12852663052171714, + "loss": 0.0643, + "num_input_tokens_seen": 19746672, + "step": 21830 + }, + { + "epoch": 5.762834895077208, + "grad_norm": 0.0006729314918629825, + "learning_rate": 0.12846833402701507, + "loss": 0.0402, + "num_input_tokens_seen": 19751216, + "step": 21835 + }, + { + "epoch": 5.764154678632704, + "grad_norm": 0.0019032730488106608, + "learning_rate": 0.12841004085276642, + "loss": 0.0645, + "num_input_tokens_seen": 19755824, + "step": 21840 + }, + { + "epoch": 5.765474462188201, + "grad_norm": 0.0005748066469095647, + "learning_rate": 0.12835175100796076, + "loss": 0.0754, + "num_input_tokens_seen": 19760400, + "step": 21845 + }, + { + "epoch": 5.766794245743698, + "grad_norm": 0.0017478405497968197, + "learning_rate": 0.12829346450158724, + "loss": 0.0786, + "num_input_tokens_seen": 19764688, + "step": 21850 + }, + { + "epoch": 5.768114029299195, + "grad_norm": 0.0009920023148879409, + "learning_rate": 0.12823518134263423, + "loss": 0.0862, + "num_input_tokens_seen": 19768880, + "step": 21855 + }, + { + "epoch": 5.769433812854691, + "grad_norm": 0.0012316189240664244, + "learning_rate": 0.12817690154008973, + "loss": 0.0771, + "num_input_tokens_seen": 19773296, + "step": 21860 + }, + { + "epoch": 5.770753596410189, + "grad_norm": 0.0010289973579347134, + "learning_rate": 0.12811862510294134, + "loss": 0.0391, + "num_input_tokens_seen": 19778096, + "step": 21865 + }, + { + "epoch": 5.772073379965685, + "grad_norm": 0.0009963583434000611, + "learning_rate": 0.12806035204017585, + "loss": 0.0816, + "num_input_tokens_seen": 19782704, + "step": 21870 + }, + { + "epoch": 5.773393163521183, + "grad_norm": 0.0028722379356622696, + "learning_rate": 0.12800208236077987, + "loss": 0.1078, + "num_input_tokens_seen": 19787312, + "step": 21875 + }, + { + "epoch": 5.774712947076679, + "grad_norm": 0.000913298106752336, + "learning_rate": 0.12794381607373917, + "loss": 0.0571, + "num_input_tokens_seen": 19791792, + "step": 21880 + }, + { + "epoch": 5.776032730632176, + "grad_norm": 0.0021605556830763817, + "learning_rate": 0.12788555318803924, + "loss": 0.0846, + "num_input_tokens_seen": 19796176, + "step": 21885 + }, + { + "epoch": 5.777352514187673, + "grad_norm": 0.0009462439920753241, + "learning_rate": 0.1278272937126649, + "loss": 0.0404, + "num_input_tokens_seen": 19800720, + "step": 21890 + }, + { + "epoch": 5.77867229774317, + "grad_norm": 0.0003409573109820485, + "learning_rate": 0.1277690376566005, + "loss": 0.039, + "num_input_tokens_seen": 19805264, + "step": 21895 + }, + { + "epoch": 5.779992081298667, + "grad_norm": 0.0024593870621174574, + "learning_rate": 0.12771078502882985, + "loss": 0.1, + "num_input_tokens_seen": 19809584, + "step": 21900 + }, + { + "epoch": 5.781311864854164, + "grad_norm": 0.0004383624764159322, + "learning_rate": 0.12765253583833633, + "loss": 0.0235, + "num_input_tokens_seen": 19814064, + "step": 21905 + }, + { + "epoch": 5.7826316484096605, + "grad_norm": 0.003155531594529748, + "learning_rate": 0.12759429009410256, + "loss": 0.0798, + "num_input_tokens_seen": 19818960, + "step": 21910 + }, + { + "epoch": 5.783951431965158, + "grad_norm": 0.0019226967124268413, + "learning_rate": 0.12753604780511085, + "loss": 0.0606, + "num_input_tokens_seen": 19823408, + "step": 21915 + }, + { + "epoch": 5.7852712155206545, + "grad_norm": 0.0029318707529455423, + "learning_rate": 0.12747780898034283, + "loss": 0.0955, + "num_input_tokens_seen": 19828208, + "step": 21920 + }, + { + "epoch": 5.786590999076152, + "grad_norm": 0.0012558629969134927, + "learning_rate": 0.12741957362877973, + "loss": 0.0694, + "num_input_tokens_seen": 19832688, + "step": 21925 + }, + { + "epoch": 5.7879107826316485, + "grad_norm": 0.0028870278038084507, + "learning_rate": 0.12736134175940214, + "loss": 0.083, + "num_input_tokens_seen": 19837200, + "step": 21930 + }, + { + "epoch": 5.789230566187145, + "grad_norm": 0.002798637608066201, + "learning_rate": 0.12730311338119016, + "loss": 0.0859, + "num_input_tokens_seen": 19841840, + "step": 21935 + }, + { + "epoch": 5.7905503497426425, + "grad_norm": 0.0015162875643000007, + "learning_rate": 0.12724488850312327, + "loss": 0.0596, + "num_input_tokens_seen": 19846384, + "step": 21940 + }, + { + "epoch": 5.791870133298139, + "grad_norm": 0.0026509840972721577, + "learning_rate": 0.1271866671341806, + "loss": 0.0576, + "num_input_tokens_seen": 19850864, + "step": 21945 + }, + { + "epoch": 5.7931899168536365, + "grad_norm": 0.0005631379899568856, + "learning_rate": 0.12712844928334047, + "loss": 0.0748, + "num_input_tokens_seen": 19855280, + "step": 21950 + }, + { + "epoch": 5.794509700409133, + "grad_norm": 0.0013932142173871398, + "learning_rate": 0.12707023495958095, + "loss": 0.0587, + "num_input_tokens_seen": 19859888, + "step": 21955 + }, + { + "epoch": 5.79582948396463, + "grad_norm": 0.002113285008817911, + "learning_rate": 0.12701202417187932, + "loss": 0.0744, + "num_input_tokens_seen": 19864528, + "step": 21960 + }, + { + "epoch": 5.797149267520127, + "grad_norm": 0.002206337871029973, + "learning_rate": 0.12695381692921243, + "loss": 0.0626, + "num_input_tokens_seen": 19868976, + "step": 21965 + }, + { + "epoch": 5.798469051075624, + "grad_norm": 0.0012269211001694202, + "learning_rate": 0.12689561324055665, + "loss": 0.0814, + "num_input_tokens_seen": 19873648, + "step": 21970 + }, + { + "epoch": 5.79978883463112, + "grad_norm": 0.0009675041656009853, + "learning_rate": 0.12683741311488758, + "loss": 0.0605, + "num_input_tokens_seen": 19877840, + "step": 21975 + }, + { + "epoch": 5.801108618186618, + "grad_norm": 0.001372173777781427, + "learning_rate": 0.1267792165611805, + "loss": 0.0397, + "num_input_tokens_seen": 19882288, + "step": 21980 + }, + { + "epoch": 5.802428401742114, + "grad_norm": 0.002178704831749201, + "learning_rate": 0.1267210235884101, + "loss": 0.0384, + "num_input_tokens_seen": 19886800, + "step": 21985 + }, + { + "epoch": 5.803748185297611, + "grad_norm": 0.002271460136398673, + "learning_rate": 0.12666283420555033, + "loss": 0.0722, + "num_input_tokens_seen": 19891312, + "step": 21990 + }, + { + "epoch": 5.805067968853108, + "grad_norm": 0.0016083696391433477, + "learning_rate": 0.12660464842157487, + "loss": 0.0679, + "num_input_tokens_seen": 19895888, + "step": 21995 + }, + { + "epoch": 5.806387752408605, + "grad_norm": 0.0024444174487143755, + "learning_rate": 0.1265464662454566, + "loss": 0.0872, + "num_input_tokens_seen": 19900592, + "step": 22000 + }, + { + "epoch": 5.806387752408605, + "eval_loss": 0.0861891433596611, + "eval_runtime": 75.9323, + "eval_samples_per_second": 88.697, + "eval_steps_per_second": 22.178, + "num_input_tokens_seen": 19900592, + "step": 22000 + }, + { + "epoch": 5.807707535964102, + "grad_norm": 0.002590698655694723, + "learning_rate": 0.12648828768616793, + "loss": 0.0532, + "num_input_tokens_seen": 19905264, + "step": 22005 + }, + { + "epoch": 5.809027319519599, + "grad_norm": 0.0008310721605084836, + "learning_rate": 0.12643011275268085, + "loss": 0.0799, + "num_input_tokens_seen": 19909744, + "step": 22010 + }, + { + "epoch": 5.810347103075095, + "grad_norm": 0.0018594823777675629, + "learning_rate": 0.1263719414539665, + "loss": 0.0629, + "num_input_tokens_seen": 19914160, + "step": 22015 + }, + { + "epoch": 5.811666886630593, + "grad_norm": 0.0026102596893906593, + "learning_rate": 0.1263137737989957, + "loss": 0.0881, + "num_input_tokens_seen": 19918832, + "step": 22020 + }, + { + "epoch": 5.812986670186089, + "grad_norm": 0.001827055704779923, + "learning_rate": 0.1262556097967387, + "loss": 0.0546, + "num_input_tokens_seen": 19923248, + "step": 22025 + }, + { + "epoch": 5.814306453741587, + "grad_norm": 0.0016047920798882842, + "learning_rate": 0.126197449456165, + "loss": 0.0759, + "num_input_tokens_seen": 19927696, + "step": 22030 + }, + { + "epoch": 5.815626237297083, + "grad_norm": 0.0003668820718303323, + "learning_rate": 0.12613929278624378, + "loss": 0.0816, + "num_input_tokens_seen": 19932464, + "step": 22035 + }, + { + "epoch": 5.81694602085258, + "grad_norm": 0.003013454144820571, + "learning_rate": 0.12608113979594343, + "loss": 0.0956, + "num_input_tokens_seen": 19936880, + "step": 22040 + }, + { + "epoch": 5.818265804408077, + "grad_norm": 0.0014895758358761668, + "learning_rate": 0.1260229904942319, + "loss": 0.0356, + "num_input_tokens_seen": 19941328, + "step": 22045 + }, + { + "epoch": 5.819585587963574, + "grad_norm": 0.0025205565616488457, + "learning_rate": 0.12596484489007662, + "loss": 0.0409, + "num_input_tokens_seen": 19945808, + "step": 22050 + }, + { + "epoch": 5.820905371519071, + "grad_norm": 0.001722763990983367, + "learning_rate": 0.1259067029924442, + "loss": 0.0471, + "num_input_tokens_seen": 19950160, + "step": 22055 + }, + { + "epoch": 5.822225155074568, + "grad_norm": 0.003386832308024168, + "learning_rate": 0.12584856481030096, + "loss": 0.0737, + "num_input_tokens_seen": 19954800, + "step": 22060 + }, + { + "epoch": 5.823544938630064, + "grad_norm": 0.001255544601008296, + "learning_rate": 0.12579043035261261, + "loss": 0.0446, + "num_input_tokens_seen": 19959152, + "step": 22065 + }, + { + "epoch": 5.824864722185562, + "grad_norm": 0.0004013463039882481, + "learning_rate": 0.1257322996283441, + "loss": 0.0445, + "num_input_tokens_seen": 19963504, + "step": 22070 + }, + { + "epoch": 5.826184505741058, + "grad_norm": 0.000913623021915555, + "learning_rate": 0.12567417264645994, + "loss": 0.0784, + "num_input_tokens_seen": 19968048, + "step": 22075 + }, + { + "epoch": 5.827504289296556, + "grad_norm": 0.0032923202961683273, + "learning_rate": 0.12561604941592408, + "loss": 0.0684, + "num_input_tokens_seen": 19972528, + "step": 22080 + }, + { + "epoch": 5.828824072852052, + "grad_norm": 0.0015773172490298748, + "learning_rate": 0.12555792994569978, + "loss": 0.0684, + "num_input_tokens_seen": 19976816, + "step": 22085 + }, + { + "epoch": 5.830143856407549, + "grad_norm": 0.0014849513536319137, + "learning_rate": 0.1254998142447499, + "loss": 0.0593, + "num_input_tokens_seen": 19981488, + "step": 22090 + }, + { + "epoch": 5.831463639963046, + "grad_norm": 0.0008531341445632279, + "learning_rate": 0.1254417023220365, + "loss": 0.066, + "num_input_tokens_seen": 19985776, + "step": 22095 + }, + { + "epoch": 5.832783423518543, + "grad_norm": 0.0012364954454824328, + "learning_rate": 0.12538359418652126, + "loss": 0.0896, + "num_input_tokens_seen": 19990320, + "step": 22100 + }, + { + "epoch": 5.8341032070740395, + "grad_norm": 0.0011268244124948978, + "learning_rate": 0.12532548984716513, + "loss": 0.052, + "num_input_tokens_seen": 19994736, + "step": 22105 + }, + { + "epoch": 5.835422990629537, + "grad_norm": 0.0003547364322002977, + "learning_rate": 0.12526738931292855, + "loss": 0.0655, + "num_input_tokens_seen": 19999280, + "step": 22110 + }, + { + "epoch": 5.8367427741850335, + "grad_norm": 0.0014035141794010997, + "learning_rate": 0.1252092925927714, + "loss": 0.0955, + "num_input_tokens_seen": 20003568, + "step": 22115 + }, + { + "epoch": 5.83806255774053, + "grad_norm": 0.002476332476362586, + "learning_rate": 0.12515119969565278, + "loss": 0.08, + "num_input_tokens_seen": 20007856, + "step": 22120 + }, + { + "epoch": 5.8393823412960275, + "grad_norm": 0.0010985403787344694, + "learning_rate": 0.12509311063053144, + "loss": 0.0502, + "num_input_tokens_seen": 20012720, + "step": 22125 + }, + { + "epoch": 5.840702124851524, + "grad_norm": 0.003981082234531641, + "learning_rate": 0.1250350254063655, + "loss": 0.057, + "num_input_tokens_seen": 20017104, + "step": 22130 + }, + { + "epoch": 5.8420219084070215, + "grad_norm": 0.001813658862374723, + "learning_rate": 0.1249769440321123, + "loss": 0.0757, + "num_input_tokens_seen": 20021872, + "step": 22135 + }, + { + "epoch": 5.843341691962518, + "grad_norm": 0.00016623838746454567, + "learning_rate": 0.12491886651672884, + "loss": 0.0522, + "num_input_tokens_seen": 20026544, + "step": 22140 + }, + { + "epoch": 5.844661475518015, + "grad_norm": 0.0021197712048888206, + "learning_rate": 0.12486079286917139, + "loss": 0.0629, + "num_input_tokens_seen": 20030896, + "step": 22145 + }, + { + "epoch": 5.845981259073512, + "grad_norm": 0.002349686808884144, + "learning_rate": 0.12480272309839553, + "loss": 0.0848, + "num_input_tokens_seen": 20035536, + "step": 22150 + }, + { + "epoch": 5.847301042629009, + "grad_norm": 0.0019657816737890244, + "learning_rate": 0.12474465721335648, + "loss": 0.0834, + "num_input_tokens_seen": 20040208, + "step": 22155 + }, + { + "epoch": 5.848620826184506, + "grad_norm": 0.0028846985660493374, + "learning_rate": 0.12468659522300861, + "loss": 0.054, + "num_input_tokens_seen": 20044720, + "step": 22160 + }, + { + "epoch": 5.849940609740003, + "grad_norm": 0.0010968392016366124, + "learning_rate": 0.12462853713630584, + "loss": 0.0465, + "num_input_tokens_seen": 20049040, + "step": 22165 + }, + { + "epoch": 5.851260393295499, + "grad_norm": 0.0012578939786180854, + "learning_rate": 0.12457048296220156, + "loss": 0.0414, + "num_input_tokens_seen": 20053648, + "step": 22170 + }, + { + "epoch": 5.852580176850997, + "grad_norm": 0.002384232124313712, + "learning_rate": 0.12451243270964832, + "loss": 0.0568, + "num_input_tokens_seen": 20058032, + "step": 22175 + }, + { + "epoch": 5.853899960406493, + "grad_norm": 0.0018706762930378318, + "learning_rate": 0.12445438638759827, + "loss": 0.0423, + "num_input_tokens_seen": 20062896, + "step": 22180 + }, + { + "epoch": 5.855219743961991, + "grad_norm": 0.0031530293636024, + "learning_rate": 0.1243963440050029, + "loss": 0.0745, + "num_input_tokens_seen": 20067152, + "step": 22185 + }, + { + "epoch": 5.856539527517487, + "grad_norm": 0.002679782919585705, + "learning_rate": 0.12433830557081298, + "loss": 0.0855, + "num_input_tokens_seen": 20071888, + "step": 22190 + }, + { + "epoch": 5.857859311072984, + "grad_norm": 0.0039010406471788883, + "learning_rate": 0.12428027109397889, + "loss": 0.1, + "num_input_tokens_seen": 20076624, + "step": 22195 + }, + { + "epoch": 5.859179094628481, + "grad_norm": 0.0029998349491506815, + "learning_rate": 0.12422224058345015, + "loss": 0.0757, + "num_input_tokens_seen": 20080976, + "step": 22200 + }, + { + "epoch": 5.859179094628481, + "eval_loss": 0.08798569440841675, + "eval_runtime": 75.9318, + "eval_samples_per_second": 88.698, + "eval_steps_per_second": 22.178, + "num_input_tokens_seen": 20080976, + "step": 22200 + }, + { + "epoch": 5.860498878183978, + "grad_norm": 0.0009110147948376834, + "learning_rate": 0.12416421404817583, + "loss": 0.0582, + "num_input_tokens_seen": 20085456, + "step": 22205 + }, + { + "epoch": 5.861818661739475, + "grad_norm": 0.0006758974050171673, + "learning_rate": 0.12410619149710447, + "loss": 0.0535, + "num_input_tokens_seen": 20089904, + "step": 22210 + }, + { + "epoch": 5.863138445294972, + "grad_norm": 0.002479512244462967, + "learning_rate": 0.12404817293918374, + "loss": 0.1133, + "num_input_tokens_seen": 20094352, + "step": 22215 + }, + { + "epoch": 5.864458228850468, + "grad_norm": 0.0008444216218777001, + "learning_rate": 0.12399015838336086, + "loss": 0.0343, + "num_input_tokens_seen": 20098608, + "step": 22220 + }, + { + "epoch": 5.865778012405966, + "grad_norm": 0.0022312700748443604, + "learning_rate": 0.12393214783858246, + "loss": 0.04, + "num_input_tokens_seen": 20103184, + "step": 22225 + }, + { + "epoch": 5.867097795961462, + "grad_norm": 0.0009960999013856053, + "learning_rate": 0.1238741413137944, + "loss": 0.069, + "num_input_tokens_seen": 20107536, + "step": 22230 + }, + { + "epoch": 5.868417579516959, + "grad_norm": 0.0021429993212223053, + "learning_rate": 0.12381613881794212, + "loss": 0.0464, + "num_input_tokens_seen": 20112016, + "step": 22235 + }, + { + "epoch": 5.869737363072456, + "grad_norm": 0.0016619206871837378, + "learning_rate": 0.12375814035997022, + "loss": 0.0419, + "num_input_tokens_seen": 20116464, + "step": 22240 + }, + { + "epoch": 5.871057146627953, + "grad_norm": 0.0024058225098997355, + "learning_rate": 0.12370014594882285, + "loss": 0.1057, + "num_input_tokens_seen": 20121232, + "step": 22245 + }, + { + "epoch": 5.872376930183449, + "grad_norm": 0.002673058770596981, + "learning_rate": 0.12364215559344356, + "loss": 0.0525, + "num_input_tokens_seen": 20125744, + "step": 22250 + }, + { + "epoch": 5.873696713738947, + "grad_norm": 0.002099599689245224, + "learning_rate": 0.12358416930277506, + "loss": 0.0478, + "num_input_tokens_seen": 20130320, + "step": 22255 + }, + { + "epoch": 5.875016497294443, + "grad_norm": 0.0004180375544819981, + "learning_rate": 0.1235261870857596, + "loss": 0.0439, + "num_input_tokens_seen": 20134608, + "step": 22260 + }, + { + "epoch": 5.876336280849941, + "grad_norm": 0.0012248620623722672, + "learning_rate": 0.12346820895133884, + "loss": 0.041, + "num_input_tokens_seen": 20138864, + "step": 22265 + }, + { + "epoch": 5.877656064405437, + "grad_norm": 0.0019809172954410315, + "learning_rate": 0.12341023490845361, + "loss": 0.0743, + "num_input_tokens_seen": 20143824, + "step": 22270 + }, + { + "epoch": 5.878975847960934, + "grad_norm": 0.00315026193857193, + "learning_rate": 0.12335226496604437, + "loss": 0.0952, + "num_input_tokens_seen": 20148368, + "step": 22275 + }, + { + "epoch": 5.880295631516431, + "grad_norm": 0.002660211641341448, + "learning_rate": 0.12329429913305069, + "loss": 0.0636, + "num_input_tokens_seen": 20152912, + "step": 22280 + }, + { + "epoch": 5.881615415071928, + "grad_norm": 0.0005873769405297935, + "learning_rate": 0.12323633741841171, + "loss": 0.0665, + "num_input_tokens_seen": 20157488, + "step": 22285 + }, + { + "epoch": 5.882935198627425, + "grad_norm": 0.001522605656646192, + "learning_rate": 0.12317837983106583, + "loss": 0.0732, + "num_input_tokens_seen": 20161776, + "step": 22290 + }, + { + "epoch": 5.884254982182922, + "grad_norm": 0.0020771375857293606, + "learning_rate": 0.12312042637995087, + "loss": 0.0802, + "num_input_tokens_seen": 20166672, + "step": 22295 + }, + { + "epoch": 5.8855747657384185, + "grad_norm": 0.0008572450024075806, + "learning_rate": 0.12306247707400389, + "loss": 0.0527, + "num_input_tokens_seen": 20171184, + "step": 22300 + }, + { + "epoch": 5.886894549293916, + "grad_norm": 0.003104955656453967, + "learning_rate": 0.12300453192216154, + "loss": 0.0747, + "num_input_tokens_seen": 20175568, + "step": 22305 + }, + { + "epoch": 5.8882143328494125, + "grad_norm": 0.002037008525803685, + "learning_rate": 0.12294659093335956, + "loss": 0.0551, + "num_input_tokens_seen": 20180208, + "step": 22310 + }, + { + "epoch": 5.88953411640491, + "grad_norm": 0.001974188955500722, + "learning_rate": 0.12288865411653327, + "loss": 0.0916, + "num_input_tokens_seen": 20184720, + "step": 22315 + }, + { + "epoch": 5.8908538999604065, + "grad_norm": 0.0023441349621862173, + "learning_rate": 0.12283072148061717, + "loss": 0.039, + "num_input_tokens_seen": 20189584, + "step": 22320 + }, + { + "epoch": 5.892173683515903, + "grad_norm": 0.0025695650838315487, + "learning_rate": 0.12277279303454529, + "loss": 0.0923, + "num_input_tokens_seen": 20194288, + "step": 22325 + }, + { + "epoch": 5.8934934670714005, + "grad_norm": 0.000620911770965904, + "learning_rate": 0.12271486878725091, + "loss": 0.0687, + "num_input_tokens_seen": 20198736, + "step": 22330 + }, + { + "epoch": 5.894813250626897, + "grad_norm": 0.0017029113369062543, + "learning_rate": 0.12265694874766658, + "loss": 0.05, + "num_input_tokens_seen": 20203248, + "step": 22335 + }, + { + "epoch": 5.8961330341823945, + "grad_norm": 0.0008482434204779565, + "learning_rate": 0.12259903292472435, + "loss": 0.0827, + "num_input_tokens_seen": 20207824, + "step": 22340 + }, + { + "epoch": 5.897452817737891, + "grad_norm": 0.002673597075045109, + "learning_rate": 0.12254112132735567, + "loss": 0.0876, + "num_input_tokens_seen": 20212432, + "step": 22345 + }, + { + "epoch": 5.898772601293388, + "grad_norm": 0.0013896391028538346, + "learning_rate": 0.12248321396449108, + "loss": 0.0611, + "num_input_tokens_seen": 20216752, + "step": 22350 + }, + { + "epoch": 5.900092384848885, + "grad_norm": 0.00044252772931940854, + "learning_rate": 0.12242531084506075, + "loss": 0.0464, + "num_input_tokens_seen": 20221648, + "step": 22355 + }, + { + "epoch": 5.901412168404382, + "grad_norm": 0.002166610211133957, + "learning_rate": 0.122367411977994, + "loss": 0.0732, + "num_input_tokens_seen": 20226032, + "step": 22360 + }, + { + "epoch": 5.902731951959878, + "grad_norm": 0.0015644388040527701, + "learning_rate": 0.12230951737221954, + "loss": 0.0642, + "num_input_tokens_seen": 20230640, + "step": 22365 + }, + { + "epoch": 5.904051735515376, + "grad_norm": 0.0028347685001790524, + "learning_rate": 0.12225162703666555, + "loss": 0.0561, + "num_input_tokens_seen": 20234928, + "step": 22370 + }, + { + "epoch": 5.905371519070872, + "grad_norm": 0.0015417387476190925, + "learning_rate": 0.1221937409802593, + "loss": 0.0364, + "num_input_tokens_seen": 20239344, + "step": 22375 + }, + { + "epoch": 5.90669130262637, + "grad_norm": 0.0013665574369952083, + "learning_rate": 0.12213585921192768, + "loss": 0.0542, + "num_input_tokens_seen": 20243792, + "step": 22380 + }, + { + "epoch": 5.908011086181866, + "grad_norm": 0.0028142468072474003, + "learning_rate": 0.1220779817405967, + "loss": 0.0909, + "num_input_tokens_seen": 20248336, + "step": 22385 + }, + { + "epoch": 5.909330869737363, + "grad_norm": 0.0026966736186295748, + "learning_rate": 0.12202010857519181, + "loss": 0.0393, + "num_input_tokens_seen": 20252816, + "step": 22390 + }, + { + "epoch": 5.91065065329286, + "grad_norm": 0.0013890733243897557, + "learning_rate": 0.12196223972463785, + "loss": 0.0814, + "num_input_tokens_seen": 20257616, + "step": 22395 + }, + { + "epoch": 5.911970436848357, + "grad_norm": 0.0019858877640217543, + "learning_rate": 0.12190437519785885, + "loss": 0.0351, + "num_input_tokens_seen": 20262096, + "step": 22400 + }, + { + "epoch": 5.911970436848357, + "eval_loss": 0.08391038328409195, + "eval_runtime": 75.8949, + "eval_samples_per_second": 88.741, + "eval_steps_per_second": 22.189, + "num_input_tokens_seen": 20262096, + "step": 22400 + }, + { + "epoch": 5.913290220403853, + "grad_norm": 0.003868117230013013, + "learning_rate": 0.12184651500377823, + "loss": 0.0522, + "num_input_tokens_seen": 20266640, + "step": 22405 + }, + { + "epoch": 5.914610003959351, + "grad_norm": 0.0017100060358643532, + "learning_rate": 0.12178865915131885, + "loss": 0.0605, + "num_input_tokens_seen": 20271024, + "step": 22410 + }, + { + "epoch": 5.915929787514847, + "grad_norm": 0.0008616366540081799, + "learning_rate": 0.1217308076494027, + "loss": 0.0303, + "num_input_tokens_seen": 20275504, + "step": 22415 + }, + { + "epoch": 5.917249571070345, + "grad_norm": 0.0012998442398384213, + "learning_rate": 0.12167296050695134, + "loss": 0.1371, + "num_input_tokens_seen": 20280080, + "step": 22420 + }, + { + "epoch": 5.918569354625841, + "grad_norm": 0.0027824551798403263, + "learning_rate": 0.12161511773288536, + "loss": 0.0268, + "num_input_tokens_seen": 20284784, + "step": 22425 + }, + { + "epoch": 5.919889138181338, + "grad_norm": 0.001850559376180172, + "learning_rate": 0.121557279336125, + "loss": 0.0636, + "num_input_tokens_seen": 20289392, + "step": 22430 + }, + { + "epoch": 5.921208921736835, + "grad_norm": 0.0007609187159687281, + "learning_rate": 0.12149944532558957, + "loss": 0.1039, + "num_input_tokens_seen": 20293680, + "step": 22435 + }, + { + "epoch": 5.922528705292332, + "grad_norm": 0.0016129029681906104, + "learning_rate": 0.12144161571019785, + "loss": 0.0603, + "num_input_tokens_seen": 20298096, + "step": 22440 + }, + { + "epoch": 5.923848488847829, + "grad_norm": 0.0009210467105731368, + "learning_rate": 0.12138379049886781, + "loss": 0.0686, + "num_input_tokens_seen": 20302608, + "step": 22445 + }, + { + "epoch": 5.925168272403326, + "grad_norm": 0.001915776520036161, + "learning_rate": 0.12132596970051697, + "loss": 0.0864, + "num_input_tokens_seen": 20306864, + "step": 22450 + }, + { + "epoch": 5.926488055958822, + "grad_norm": 0.001831753645092249, + "learning_rate": 0.12126815332406189, + "loss": 0.044, + "num_input_tokens_seen": 20311184, + "step": 22455 + }, + { + "epoch": 5.92780783951432, + "grad_norm": 0.0019456815207377076, + "learning_rate": 0.12121034137841868, + "loss": 0.0604, + "num_input_tokens_seen": 20315472, + "step": 22460 + }, + { + "epoch": 5.929127623069816, + "grad_norm": 0.002721516415476799, + "learning_rate": 0.12115253387250258, + "loss": 0.0414, + "num_input_tokens_seen": 20320144, + "step": 22465 + }, + { + "epoch": 5.930447406625314, + "grad_norm": 0.002387170447036624, + "learning_rate": 0.12109473081522831, + "loss": 0.0688, + "num_input_tokens_seen": 20324592, + "step": 22470 + }, + { + "epoch": 5.9317671901808104, + "grad_norm": 0.0006639512721449137, + "learning_rate": 0.12103693221550982, + "loss": 0.0659, + "num_input_tokens_seen": 20329520, + "step": 22475 + }, + { + "epoch": 5.933086973736307, + "grad_norm": 0.0011593005619943142, + "learning_rate": 0.12097913808226027, + "loss": 0.0386, + "num_input_tokens_seen": 20334096, + "step": 22480 + }, + { + "epoch": 5.9344067572918044, + "grad_norm": 0.0035999563988298178, + "learning_rate": 0.12092134842439234, + "loss": 0.065, + "num_input_tokens_seen": 20338960, + "step": 22485 + }, + { + "epoch": 5.935726540847301, + "grad_norm": 0.0010886540403589606, + "learning_rate": 0.12086356325081798, + "loss": 0.0778, + "num_input_tokens_seen": 20343376, + "step": 22490 + }, + { + "epoch": 5.937046324402798, + "grad_norm": 0.001369823352433741, + "learning_rate": 0.12080578257044824, + "loss": 0.0594, + "num_input_tokens_seen": 20348272, + "step": 22495 + }, + { + "epoch": 5.938366107958295, + "grad_norm": 0.001243081409484148, + "learning_rate": 0.12074800639219378, + "loss": 0.0475, + "num_input_tokens_seen": 20353040, + "step": 22500 + }, + { + "epoch": 5.939685891513792, + "grad_norm": 0.0028554387390613556, + "learning_rate": 0.12069023472496428, + "loss": 0.0775, + "num_input_tokens_seen": 20357104, + "step": 22505 + }, + { + "epoch": 5.941005675069289, + "grad_norm": 0.003667747601866722, + "learning_rate": 0.12063246757766893, + "loss": 0.0454, + "num_input_tokens_seen": 20361776, + "step": 22510 + }, + { + "epoch": 5.942325458624786, + "grad_norm": 0.0033814329653978348, + "learning_rate": 0.12057470495921618, + "loss": 0.11, + "num_input_tokens_seen": 20366224, + "step": 22515 + }, + { + "epoch": 5.943645242180282, + "grad_norm": 0.0016432743286713958, + "learning_rate": 0.12051694687851364, + "loss": 0.0673, + "num_input_tokens_seen": 20370608, + "step": 22520 + }, + { + "epoch": 5.94496502573578, + "grad_norm": 0.0019962391816079617, + "learning_rate": 0.12045919334446839, + "loss": 0.0671, + "num_input_tokens_seen": 20375152, + "step": 22525 + }, + { + "epoch": 5.946284809291276, + "grad_norm": 0.0005223580519668758, + "learning_rate": 0.12040144436598683, + "loss": 0.0857, + "num_input_tokens_seen": 20379920, + "step": 22530 + }, + { + "epoch": 5.947604592846773, + "grad_norm": 0.001066565397195518, + "learning_rate": 0.12034369995197444, + "loss": 0.0438, + "num_input_tokens_seen": 20384496, + "step": 22535 + }, + { + "epoch": 5.94892437640227, + "grad_norm": 0.0017185697797685862, + "learning_rate": 0.12028596011133627, + "loss": 0.072, + "num_input_tokens_seen": 20388528, + "step": 22540 + }, + { + "epoch": 5.950244159957767, + "grad_norm": 0.002629413502290845, + "learning_rate": 0.12022822485297643, + "loss": 0.0873, + "num_input_tokens_seen": 20392880, + "step": 22545 + }, + { + "epoch": 5.951563943513264, + "grad_norm": 0.0029650041833519936, + "learning_rate": 0.12017049418579843, + "loss": 0.0425, + "num_input_tokens_seen": 20397424, + "step": 22550 + }, + { + "epoch": 5.952883727068761, + "grad_norm": 0.0009793160716071725, + "learning_rate": 0.12011276811870514, + "loss": 0.0498, + "num_input_tokens_seen": 20402096, + "step": 22555 + }, + { + "epoch": 5.954203510624257, + "grad_norm": 0.0028427031356841326, + "learning_rate": 0.12005504666059852, + "loss": 0.0694, + "num_input_tokens_seen": 20406864, + "step": 22560 + }, + { + "epoch": 5.955523294179755, + "grad_norm": 0.001658789231441915, + "learning_rate": 0.11999732982038003, + "loss": 0.0469, + "num_input_tokens_seen": 20411312, + "step": 22565 + }, + { + "epoch": 5.956843077735251, + "grad_norm": 0.0029079290106892586, + "learning_rate": 0.11993961760695038, + "loss": 0.0608, + "num_input_tokens_seen": 20415952, + "step": 22570 + }, + { + "epoch": 5.958162861290749, + "grad_norm": 0.0008802649099379778, + "learning_rate": 0.11988191002920942, + "loss": 0.0481, + "num_input_tokens_seen": 20420496, + "step": 22575 + }, + { + "epoch": 5.959482644846245, + "grad_norm": 0.002318193670362234, + "learning_rate": 0.11982420709605641, + "loss": 0.06, + "num_input_tokens_seen": 20425072, + "step": 22580 + }, + { + "epoch": 5.960802428401742, + "grad_norm": 0.004641298670321703, + "learning_rate": 0.11976650881638991, + "loss": 0.0599, + "num_input_tokens_seen": 20429392, + "step": 22585 + }, + { + "epoch": 5.962122211957239, + "grad_norm": 0.003298509633168578, + "learning_rate": 0.11970881519910764, + "loss": 0.0571, + "num_input_tokens_seen": 20433808, + "step": 22590 + }, + { + "epoch": 5.963441995512736, + "grad_norm": 0.002359880367293954, + "learning_rate": 0.1196511262531068, + "loss": 0.0398, + "num_input_tokens_seen": 20438032, + "step": 22595 + }, + { + "epoch": 5.964761779068233, + "grad_norm": 0.002488996833562851, + "learning_rate": 0.11959344198728361, + "loss": 0.0361, + "num_input_tokens_seen": 20442992, + "step": 22600 + }, + { + "epoch": 5.964761779068233, + "eval_loss": 0.08796610683202744, + "eval_runtime": 75.8517, + "eval_samples_per_second": 88.792, + "eval_steps_per_second": 22.201, + "num_input_tokens_seen": 20442992, + "step": 22600 + }, + { + "epoch": 5.96608156262373, + "grad_norm": 0.0008050224278122187, + "learning_rate": 0.11953576241053378, + "loss": 0.0889, + "num_input_tokens_seen": 20447440, + "step": 22605 + }, + { + "epoch": 5.967401346179226, + "grad_norm": 0.0005129599594511092, + "learning_rate": 0.11947808753175228, + "loss": 0.0684, + "num_input_tokens_seen": 20451920, + "step": 22610 + }, + { + "epoch": 5.968721129734724, + "grad_norm": 0.0020836240146309137, + "learning_rate": 0.1194204173598332, + "loss": 0.0712, + "num_input_tokens_seen": 20456464, + "step": 22615 + }, + { + "epoch": 5.97004091329022, + "grad_norm": 0.002719617448747158, + "learning_rate": 0.11936275190367007, + "loss": 0.0587, + "num_input_tokens_seen": 20460880, + "step": 22620 + }, + { + "epoch": 5.971360696845718, + "grad_norm": 0.001787084387615323, + "learning_rate": 0.11930509117215563, + "loss": 0.0434, + "num_input_tokens_seen": 20465552, + "step": 22625 + }, + { + "epoch": 5.972680480401214, + "grad_norm": 0.006078392267227173, + "learning_rate": 0.11924743517418179, + "loss": 0.0671, + "num_input_tokens_seen": 20469872, + "step": 22630 + }, + { + "epoch": 5.974000263956711, + "grad_norm": 0.0016961536603048444, + "learning_rate": 0.11918978391864, + "loss": 0.0451, + "num_input_tokens_seen": 20474512, + "step": 22635 + }, + { + "epoch": 5.975320047512208, + "grad_norm": 0.0010815964778885245, + "learning_rate": 0.11913213741442065, + "loss": 0.0405, + "num_input_tokens_seen": 20479504, + "step": 22640 + }, + { + "epoch": 5.976639831067705, + "grad_norm": 0.0031426793430000544, + "learning_rate": 0.11907449567041364, + "loss": 0.0431, + "num_input_tokens_seen": 20484016, + "step": 22645 + }, + { + "epoch": 5.9779596146232015, + "grad_norm": 0.002072381554171443, + "learning_rate": 0.11901685869550803, + "loss": 0.0547, + "num_input_tokens_seen": 20488624, + "step": 22650 + }, + { + "epoch": 5.979279398178699, + "grad_norm": 0.002143830293789506, + "learning_rate": 0.1189592264985922, + "loss": 0.0645, + "num_input_tokens_seen": 20493168, + "step": 22655 + }, + { + "epoch": 5.9805991817341955, + "grad_norm": 0.00154978153295815, + "learning_rate": 0.11890159908855373, + "loss": 0.0397, + "num_input_tokens_seen": 20497552, + "step": 22660 + }, + { + "epoch": 5.981918965289692, + "grad_norm": 0.005239961668848991, + "learning_rate": 0.11884397647427941, + "loss": 0.0684, + "num_input_tokens_seen": 20501936, + "step": 22665 + }, + { + "epoch": 5.9832387488451895, + "grad_norm": 0.0006096283323131502, + "learning_rate": 0.11878635866465546, + "loss": 0.0727, + "num_input_tokens_seen": 20506576, + "step": 22670 + }, + { + "epoch": 5.984558532400686, + "grad_norm": 0.0023438534699380398, + "learning_rate": 0.11872874566856734, + "loss": 0.0779, + "num_input_tokens_seen": 20511056, + "step": 22675 + }, + { + "epoch": 5.9858783159561835, + "grad_norm": 0.0017089679604396224, + "learning_rate": 0.11867113749489955, + "loss": 0.0632, + "num_input_tokens_seen": 20515472, + "step": 22680 + }, + { + "epoch": 5.98719809951168, + "grad_norm": 0.0016378412256017327, + "learning_rate": 0.11861353415253607, + "loss": 0.0723, + "num_input_tokens_seen": 20520144, + "step": 22685 + }, + { + "epoch": 5.988517883067177, + "grad_norm": 0.0013281163992360234, + "learning_rate": 0.11855593565036011, + "loss": 0.0637, + "num_input_tokens_seen": 20524880, + "step": 22690 + }, + { + "epoch": 5.989837666622674, + "grad_norm": 0.001014670473523438, + "learning_rate": 0.11849834199725394, + "loss": 0.0939, + "num_input_tokens_seen": 20529136, + "step": 22695 + }, + { + "epoch": 5.991157450178171, + "grad_norm": 0.0023684173356741667, + "learning_rate": 0.1184407532020994, + "loss": 0.098, + "num_input_tokens_seen": 20533616, + "step": 22700 + }, + { + "epoch": 5.992477233733668, + "grad_norm": 0.004007918294519186, + "learning_rate": 0.11838316927377723, + "loss": 0.0784, + "num_input_tokens_seen": 20538224, + "step": 22705 + }, + { + "epoch": 5.993797017289165, + "grad_norm": 0.0008866480784490705, + "learning_rate": 0.11832559022116766, + "loss": 0.0481, + "num_input_tokens_seen": 20542832, + "step": 22710 + }, + { + "epoch": 5.995116800844661, + "grad_norm": 0.00114531465806067, + "learning_rate": 0.11826801605315022, + "loss": 0.0604, + "num_input_tokens_seen": 20547440, + "step": 22715 + }, + { + "epoch": 5.996436584400159, + "grad_norm": 0.0024601672776043415, + "learning_rate": 0.1182104467786034, + "loss": 0.0726, + "num_input_tokens_seen": 20552112, + "step": 22720 + }, + { + "epoch": 5.997756367955655, + "grad_norm": 0.0009367665625177324, + "learning_rate": 0.1181528824064052, + "loss": 0.0334, + "num_input_tokens_seen": 20556624, + "step": 22725 + }, + { + "epoch": 5.999076151511153, + "grad_norm": 0.001852698507718742, + "learning_rate": 0.11809532294543279, + "loss": 0.0522, + "num_input_tokens_seen": 20560912, + "step": 22730 + }, + { + "epoch": 6.000263956711099, + "grad_norm": 0.0013221424305811524, + "learning_rate": 0.11803776840456245, + "loss": 0.0501, + "num_input_tokens_seen": 20564768, + "step": 22735 + }, + { + "epoch": 6.001583740266597, + "grad_norm": 0.0015898814890533686, + "learning_rate": 0.11798021879266997, + "loss": 0.034, + "num_input_tokens_seen": 20569440, + "step": 22740 + }, + { + "epoch": 6.002903523822093, + "grad_norm": 0.0005157674313522875, + "learning_rate": 0.11792267411863006, + "loss": 0.0239, + "num_input_tokens_seen": 20573984, + "step": 22745 + }, + { + "epoch": 6.00422330737759, + "grad_norm": 0.0014355105813592672, + "learning_rate": 0.1178651343913169, + "loss": 0.0322, + "num_input_tokens_seen": 20578624, + "step": 22750 + }, + { + "epoch": 6.005543090933087, + "grad_norm": 0.001460685976780951, + "learning_rate": 0.11780759961960392, + "loss": 0.0434, + "num_input_tokens_seen": 20583168, + "step": 22755 + }, + { + "epoch": 6.006862874488584, + "grad_norm": 0.0032171981874853373, + "learning_rate": 0.1177500698123636, + "loss": 0.0572, + "num_input_tokens_seen": 20587840, + "step": 22760 + }, + { + "epoch": 6.008182658044081, + "grad_norm": 0.0012361779808998108, + "learning_rate": 0.11769254497846778, + "loss": 0.0341, + "num_input_tokens_seen": 20592704, + "step": 22765 + }, + { + "epoch": 6.009502441599578, + "grad_norm": 0.0007540935184806585, + "learning_rate": 0.11763502512678758, + "loss": 0.0308, + "num_input_tokens_seen": 20597216, + "step": 22770 + }, + { + "epoch": 6.010822225155074, + "grad_norm": 0.000834271137136966, + "learning_rate": 0.11757751026619315, + "loss": 0.0446, + "num_input_tokens_seen": 20601792, + "step": 22775 + }, + { + "epoch": 6.012142008710572, + "grad_norm": 0.001981583423912525, + "learning_rate": 0.11752000040555416, + "loss": 0.0441, + "num_input_tokens_seen": 20606112, + "step": 22780 + }, + { + "epoch": 6.013461792266068, + "grad_norm": 0.004522365052253008, + "learning_rate": 0.11746249555373921, + "loss": 0.0325, + "num_input_tokens_seen": 20610592, + "step": 22785 + }, + { + "epoch": 6.014781575821566, + "grad_norm": 0.0015850389609113336, + "learning_rate": 0.11740499571961638, + "loss": 0.0304, + "num_input_tokens_seen": 20615136, + "step": 22790 + }, + { + "epoch": 6.016101359377062, + "grad_norm": 0.0017822060035541654, + "learning_rate": 0.11734750091205279, + "loss": 0.0253, + "num_input_tokens_seen": 20619520, + "step": 22795 + }, + { + "epoch": 6.017421142932559, + "grad_norm": 0.002182852476835251, + "learning_rate": 0.11729001113991493, + "loss": 0.0276, + "num_input_tokens_seen": 20624000, + "step": 22800 + }, + { + "epoch": 6.017421142932559, + "eval_loss": 0.09123595803976059, + "eval_runtime": 75.9421, + "eval_samples_per_second": 88.686, + "eval_steps_per_second": 22.175, + "num_input_tokens_seen": 20624000, + "step": 22800 + }, + { + "epoch": 6.018740926488056, + "grad_norm": 0.0007058345945551991, + "learning_rate": 0.11723252641206837, + "loss": 0.0243, + "num_input_tokens_seen": 20628480, + "step": 22805 + }, + { + "epoch": 6.020060710043553, + "grad_norm": 0.0010840531904250383, + "learning_rate": 0.11717504673737808, + "loss": 0.0711, + "num_input_tokens_seen": 20633056, + "step": 22810 + }, + { + "epoch": 6.021380493599049, + "grad_norm": 0.0007700729183852673, + "learning_rate": 0.11711757212470802, + "loss": 0.0181, + "num_input_tokens_seen": 20637472, + "step": 22815 + }, + { + "epoch": 6.022700277154547, + "grad_norm": 0.0008926500449888408, + "learning_rate": 0.11706010258292165, + "loss": 0.0364, + "num_input_tokens_seen": 20641920, + "step": 22820 + }, + { + "epoch": 6.024020060710043, + "grad_norm": 0.0024228408001363277, + "learning_rate": 0.11700263812088131, + "loss": 0.0438, + "num_input_tokens_seen": 20646464, + "step": 22825 + }, + { + "epoch": 6.025339844265541, + "grad_norm": 0.001843856181949377, + "learning_rate": 0.11694517874744892, + "loss": 0.0454, + "num_input_tokens_seen": 20650688, + "step": 22830 + }, + { + "epoch": 6.026659627821037, + "grad_norm": 0.0014947089366614819, + "learning_rate": 0.11688772447148532, + "loss": 0.0319, + "num_input_tokens_seen": 20655200, + "step": 22835 + }, + { + "epoch": 6.027979411376534, + "grad_norm": 0.002660257974639535, + "learning_rate": 0.11683027530185074, + "loss": 0.0458, + "num_input_tokens_seen": 20659680, + "step": 22840 + }, + { + "epoch": 6.029299194932031, + "grad_norm": 0.0013840011088177562, + "learning_rate": 0.11677283124740451, + "loss": 0.0409, + "num_input_tokens_seen": 20664288, + "step": 22845 + }, + { + "epoch": 6.030618978487528, + "grad_norm": 0.0013007069937884808, + "learning_rate": 0.11671539231700531, + "loss": 0.0365, + "num_input_tokens_seen": 20668864, + "step": 22850 + }, + { + "epoch": 6.031938762043025, + "grad_norm": 0.0017084567807614803, + "learning_rate": 0.11665795851951084, + "loss": 0.06, + "num_input_tokens_seen": 20673632, + "step": 22855 + }, + { + "epoch": 6.033258545598522, + "grad_norm": 0.00185254099778831, + "learning_rate": 0.11660052986377825, + "loss": 0.0289, + "num_input_tokens_seen": 20678176, + "step": 22860 + }, + { + "epoch": 6.0345783291540185, + "grad_norm": 0.001438627950847149, + "learning_rate": 0.1165431063586636, + "loss": 0.0611, + "num_input_tokens_seen": 20683040, + "step": 22865 + }, + { + "epoch": 6.035898112709516, + "grad_norm": 0.003509717993438244, + "learning_rate": 0.11648568801302245, + "loss": 0.064, + "num_input_tokens_seen": 20687648, + "step": 22870 + }, + { + "epoch": 6.0372178962650125, + "grad_norm": 0.003416827181354165, + "learning_rate": 0.11642827483570937, + "loss": 0.0459, + "num_input_tokens_seen": 20692128, + "step": 22875 + }, + { + "epoch": 6.038537679820509, + "grad_norm": 0.0010443245992064476, + "learning_rate": 0.11637086683557815, + "loss": 0.0386, + "num_input_tokens_seen": 20696640, + "step": 22880 + }, + { + "epoch": 6.0398574633760065, + "grad_norm": 0.0022279308177530766, + "learning_rate": 0.11631346402148188, + "loss": 0.0252, + "num_input_tokens_seen": 20701152, + "step": 22885 + }, + { + "epoch": 6.041177246931503, + "grad_norm": 0.0033509735949337482, + "learning_rate": 0.11625606640227285, + "loss": 0.0383, + "num_input_tokens_seen": 20705856, + "step": 22890 + }, + { + "epoch": 6.0424970304870005, + "grad_norm": 0.0018055472755804658, + "learning_rate": 0.11619867398680238, + "loss": 0.0231, + "num_input_tokens_seen": 20710272, + "step": 22895 + }, + { + "epoch": 6.043816814042497, + "grad_norm": 0.0017197475535795093, + "learning_rate": 0.11614128678392119, + "loss": 0.0492, + "num_input_tokens_seen": 20714976, + "step": 22900 + }, + { + "epoch": 6.045136597597994, + "grad_norm": 0.002460555639117956, + "learning_rate": 0.11608390480247906, + "loss": 0.0269, + "num_input_tokens_seen": 20719488, + "step": 22905 + }, + { + "epoch": 6.046456381153491, + "grad_norm": 0.005480199120938778, + "learning_rate": 0.11602652805132499, + "loss": 0.0718, + "num_input_tokens_seen": 20724096, + "step": 22910 + }, + { + "epoch": 6.047776164708988, + "grad_norm": 0.001723006833344698, + "learning_rate": 0.11596915653930731, + "loss": 0.022, + "num_input_tokens_seen": 20728448, + "step": 22915 + }, + { + "epoch": 6.049095948264485, + "grad_norm": 0.0015531115932390094, + "learning_rate": 0.11591179027527328, + "loss": 0.0505, + "num_input_tokens_seen": 20733248, + "step": 22920 + }, + { + "epoch": 6.050415731819982, + "grad_norm": 0.001900735660456121, + "learning_rate": 0.11585442926806956, + "loss": 0.0497, + "num_input_tokens_seen": 20737632, + "step": 22925 + }, + { + "epoch": 6.051735515375478, + "grad_norm": 0.003001286182552576, + "learning_rate": 0.11579707352654202, + "loss": 0.0712, + "num_input_tokens_seen": 20742176, + "step": 22930 + }, + { + "epoch": 6.053055298930976, + "grad_norm": 0.0007275187526829541, + "learning_rate": 0.11573972305953548, + "loss": 0.0222, + "num_input_tokens_seen": 20746752, + "step": 22935 + }, + { + "epoch": 6.054375082486472, + "grad_norm": 0.004915256518870592, + "learning_rate": 0.11568237787589426, + "loss": 0.0546, + "num_input_tokens_seen": 20751424, + "step": 22940 + }, + { + "epoch": 6.055694866041969, + "grad_norm": 0.0006938359001651406, + "learning_rate": 0.11562503798446161, + "loss": 0.036, + "num_input_tokens_seen": 20755808, + "step": 22945 + }, + { + "epoch": 6.057014649597466, + "grad_norm": 0.0005720243789255619, + "learning_rate": 0.11556770339408005, + "loss": 0.0438, + "num_input_tokens_seen": 20760608, + "step": 22950 + }, + { + "epoch": 6.058334433152963, + "grad_norm": 0.0006029666401445866, + "learning_rate": 0.1155103741135914, + "loss": 0.0333, + "num_input_tokens_seen": 20765184, + "step": 22955 + }, + { + "epoch": 6.05965421670846, + "grad_norm": 0.00078401411883533, + "learning_rate": 0.1154530501518364, + "loss": 0.0228, + "num_input_tokens_seen": 20769664, + "step": 22960 + }, + { + "epoch": 6.060974000263957, + "grad_norm": 0.0029301561880856752, + "learning_rate": 0.11539573151765523, + "loss": 0.047, + "num_input_tokens_seen": 20774304, + "step": 22965 + }, + { + "epoch": 6.062293783819453, + "grad_norm": 0.003032154403626919, + "learning_rate": 0.11533841821988719, + "loss": 0.0559, + "num_input_tokens_seen": 20779008, + "step": 22970 + }, + { + "epoch": 6.063613567374951, + "grad_norm": 0.0013547531561926007, + "learning_rate": 0.11528111026737059, + "loss": 0.0336, + "num_input_tokens_seen": 20783488, + "step": 22975 + }, + { + "epoch": 6.064933350930447, + "grad_norm": 0.0032384854275733232, + "learning_rate": 0.11522380766894312, + "loss": 0.0632, + "num_input_tokens_seen": 20787968, + "step": 22980 + }, + { + "epoch": 6.066253134485945, + "grad_norm": 0.002567998832091689, + "learning_rate": 0.11516651043344152, + "loss": 0.0229, + "num_input_tokens_seen": 20792352, + "step": 22985 + }, + { + "epoch": 6.067572918041441, + "grad_norm": 0.0007129671867005527, + "learning_rate": 0.11510921856970172, + "loss": 0.0445, + "num_input_tokens_seen": 20796928, + "step": 22990 + }, + { + "epoch": 6.068892701596938, + "grad_norm": 0.001678326865658164, + "learning_rate": 0.11505193208655895, + "loss": 0.0467, + "num_input_tokens_seen": 20801024, + "step": 22995 + }, + { + "epoch": 6.070212485152435, + "grad_norm": 0.003484911983832717, + "learning_rate": 0.11499465099284738, + "loss": 0.0642, + "num_input_tokens_seen": 20805728, + "step": 23000 + }, + { + "epoch": 6.070212485152435, + "eval_loss": 0.09009058773517609, + "eval_runtime": 75.8527, + "eval_samples_per_second": 88.791, + "eval_steps_per_second": 22.201, + "num_input_tokens_seen": 20805728, + "step": 23000 + }, + { + "epoch": 6.071532268707932, + "grad_norm": 0.003403523238375783, + "learning_rate": 0.1149373752974006, + "loss": 0.0382, + "num_input_tokens_seen": 20810304, + "step": 23005 + }, + { + "epoch": 6.072852052263428, + "grad_norm": 0.0014110561460256577, + "learning_rate": 0.11488010500905109, + "loss": 0.0405, + "num_input_tokens_seen": 20815136, + "step": 23010 + }, + { + "epoch": 6.074171835818926, + "grad_norm": 0.0024527248460799456, + "learning_rate": 0.11482284013663077, + "loss": 0.0682, + "num_input_tokens_seen": 20819392, + "step": 23015 + }, + { + "epoch": 6.075491619374422, + "grad_norm": 0.0013511733850464225, + "learning_rate": 0.11476558068897061, + "loss": 0.0315, + "num_input_tokens_seen": 20823936, + "step": 23020 + }, + { + "epoch": 6.07681140292992, + "grad_norm": 0.003668451914563775, + "learning_rate": 0.11470832667490061, + "loss": 0.0594, + "num_input_tokens_seen": 20828608, + "step": 23025 + }, + { + "epoch": 6.078131186485416, + "grad_norm": 0.001970302779227495, + "learning_rate": 0.11465107810325013, + "loss": 0.0707, + "num_input_tokens_seen": 20833216, + "step": 23030 + }, + { + "epoch": 6.079450970040913, + "grad_norm": 0.0036025037989020348, + "learning_rate": 0.11459383498284771, + "loss": 0.0473, + "num_input_tokens_seen": 20837504, + "step": 23035 + }, + { + "epoch": 6.08077075359641, + "grad_norm": 0.0010531177977100015, + "learning_rate": 0.11453659732252082, + "loss": 0.032, + "num_input_tokens_seen": 20841824, + "step": 23040 + }, + { + "epoch": 6.082090537151907, + "grad_norm": 0.0016904957592487335, + "learning_rate": 0.11447936513109633, + "loss": 0.041, + "num_input_tokens_seen": 20846304, + "step": 23045 + }, + { + "epoch": 6.083410320707404, + "grad_norm": 0.002670666202902794, + "learning_rate": 0.11442213841740011, + "loss": 0.0532, + "num_input_tokens_seen": 20850752, + "step": 23050 + }, + { + "epoch": 6.084730104262901, + "grad_norm": 0.0005232024705037475, + "learning_rate": 0.1143649171902572, + "loss": 0.0275, + "num_input_tokens_seen": 20855296, + "step": 23055 + }, + { + "epoch": 6.0860498878183975, + "grad_norm": 0.003181836800649762, + "learning_rate": 0.11430770145849194, + "loss": 0.0539, + "num_input_tokens_seen": 20859904, + "step": 23060 + }, + { + "epoch": 6.087369671373895, + "grad_norm": 0.002637961646541953, + "learning_rate": 0.11425049123092756, + "loss": 0.0532, + "num_input_tokens_seen": 20864832, + "step": 23065 + }, + { + "epoch": 6.0886894549293915, + "grad_norm": 0.004397348966449499, + "learning_rate": 0.11419328651638674, + "loss": 0.0455, + "num_input_tokens_seen": 20869280, + "step": 23070 + }, + { + "epoch": 6.090009238484888, + "grad_norm": 0.002542301779612899, + "learning_rate": 0.11413608732369115, + "loss": 0.0335, + "num_input_tokens_seen": 20874016, + "step": 23075 + }, + { + "epoch": 6.0913290220403855, + "grad_norm": 0.0032645384781062603, + "learning_rate": 0.11407889366166153, + "loss": 0.0758, + "num_input_tokens_seen": 20878528, + "step": 23080 + }, + { + "epoch": 6.092648805595882, + "grad_norm": 0.001014665700495243, + "learning_rate": 0.11402170553911797, + "loss": 0.0504, + "num_input_tokens_seen": 20883072, + "step": 23085 + }, + { + "epoch": 6.0939685891513795, + "grad_norm": 0.001319289207458496, + "learning_rate": 0.11396452296487955, + "loss": 0.0351, + "num_input_tokens_seen": 20887392, + "step": 23090 + }, + { + "epoch": 6.095288372706876, + "grad_norm": 0.004670926835387945, + "learning_rate": 0.11390734594776449, + "loss": 0.0687, + "num_input_tokens_seen": 20891936, + "step": 23095 + }, + { + "epoch": 6.096608156262373, + "grad_norm": 0.0011346215615049005, + "learning_rate": 0.11385017449659031, + "loss": 0.0469, + "num_input_tokens_seen": 20896640, + "step": 23100 + }, + { + "epoch": 6.09792793981787, + "grad_norm": 0.0017075863434001803, + "learning_rate": 0.11379300862017344, + "loss": 0.0601, + "num_input_tokens_seen": 20901344, + "step": 23105 + }, + { + "epoch": 6.099247723373367, + "grad_norm": 0.0026989334728568792, + "learning_rate": 0.11373584832732966, + "loss": 0.0469, + "num_input_tokens_seen": 20906176, + "step": 23110 + }, + { + "epoch": 6.100567506928864, + "grad_norm": 0.0010829734383150935, + "learning_rate": 0.11367869362687386, + "loss": 0.0299, + "num_input_tokens_seen": 20910688, + "step": 23115 + }, + { + "epoch": 6.101887290484361, + "grad_norm": 0.0010431020054966211, + "learning_rate": 0.11362154452761988, + "loss": 0.0681, + "num_input_tokens_seen": 20915200, + "step": 23120 + }, + { + "epoch": 6.103207074039857, + "grad_norm": 0.0014350868295878172, + "learning_rate": 0.11356440103838095, + "loss": 0.0343, + "num_input_tokens_seen": 20920032, + "step": 23125 + }, + { + "epoch": 6.104526857595355, + "grad_norm": 0.00071081513306126, + "learning_rate": 0.11350726316796922, + "loss": 0.0373, + "num_input_tokens_seen": 20924512, + "step": 23130 + }, + { + "epoch": 6.105846641150851, + "grad_norm": 0.0007616475340910256, + "learning_rate": 0.11345013092519607, + "loss": 0.045, + "num_input_tokens_seen": 20929056, + "step": 23135 + }, + { + "epoch": 6.107166424706348, + "grad_norm": 0.0047723352909088135, + "learning_rate": 0.11339300431887213, + "loss": 0.0759, + "num_input_tokens_seen": 20933344, + "step": 23140 + }, + { + "epoch": 6.108486208261845, + "grad_norm": 0.0037293636705726385, + "learning_rate": 0.11333588335780687, + "loss": 0.0446, + "num_input_tokens_seen": 20937824, + "step": 23145 + }, + { + "epoch": 6.109805991817342, + "grad_norm": 0.0023727314546704292, + "learning_rate": 0.11327876805080916, + "loss": 0.0609, + "num_input_tokens_seen": 20942464, + "step": 23150 + }, + { + "epoch": 6.111125775372839, + "grad_norm": 0.0017602529842406511, + "learning_rate": 0.11322165840668696, + "loss": 0.0562, + "num_input_tokens_seen": 20946912, + "step": 23155 + }, + { + "epoch": 6.112445558928336, + "grad_norm": 0.002886259462684393, + "learning_rate": 0.11316455443424717, + "loss": 0.0449, + "num_input_tokens_seen": 20951456, + "step": 23160 + }, + { + "epoch": 6.113765342483832, + "grad_norm": 0.0024074942339211702, + "learning_rate": 0.11310745614229603, + "loss": 0.0386, + "num_input_tokens_seen": 20955968, + "step": 23165 + }, + { + "epoch": 6.11508512603933, + "grad_norm": 0.0015485434560105205, + "learning_rate": 0.1130503635396387, + "loss": 0.0413, + "num_input_tokens_seen": 20960352, + "step": 23170 + }, + { + "epoch": 6.116404909594826, + "grad_norm": 0.0011860980885103345, + "learning_rate": 0.11299327663507966, + "loss": 0.0471, + "num_input_tokens_seen": 20964736, + "step": 23175 + }, + { + "epoch": 6.117724693150324, + "grad_norm": 0.0007679357076995075, + "learning_rate": 0.11293619543742246, + "loss": 0.055, + "num_input_tokens_seen": 20968960, + "step": 23180 + }, + { + "epoch": 6.11904447670582, + "grad_norm": 0.0006080842576920986, + "learning_rate": 0.11287911995546965, + "loss": 0.0408, + "num_input_tokens_seen": 20973440, + "step": 23185 + }, + { + "epoch": 6.120364260261317, + "grad_norm": 0.0007802638574503362, + "learning_rate": 0.11282205019802308, + "loss": 0.059, + "num_input_tokens_seen": 20977824, + "step": 23190 + }, + { + "epoch": 6.121684043816814, + "grad_norm": 0.002667314140126109, + "learning_rate": 0.11276498617388354, + "loss": 0.0376, + "num_input_tokens_seen": 20982336, + "step": 23195 + }, + { + "epoch": 6.123003827372311, + "grad_norm": 0.004093896131962538, + "learning_rate": 0.11270792789185109, + "loss": 0.0368, + "num_input_tokens_seen": 20986976, + "step": 23200 + }, + { + "epoch": 6.123003827372311, + "eval_loss": 0.09054750204086304, + "eval_runtime": 75.907, + "eval_samples_per_second": 88.727, + "eval_steps_per_second": 22.185, + "num_input_tokens_seen": 20986976, + "step": 23200 + }, + { + "epoch": 6.124323610927807, + "grad_norm": 0.0030616559088230133, + "learning_rate": 0.11265087536072482, + "loss": 0.0752, + "num_input_tokens_seen": 20991552, + "step": 23205 + }, + { + "epoch": 6.125643394483305, + "grad_norm": 0.0015502764144912362, + "learning_rate": 0.11259382858930288, + "loss": 0.0248, + "num_input_tokens_seen": 20995872, + "step": 23210 + }, + { + "epoch": 6.126963178038801, + "grad_norm": 0.0018607565434649587, + "learning_rate": 0.11253678758638262, + "loss": 0.029, + "num_input_tokens_seen": 21000480, + "step": 23215 + }, + { + "epoch": 6.128282961594299, + "grad_norm": 0.002047445857897401, + "learning_rate": 0.11247975236076059, + "loss": 0.0388, + "num_input_tokens_seen": 21004864, + "step": 23220 + }, + { + "epoch": 6.129602745149795, + "grad_norm": 0.0008093093056231737, + "learning_rate": 0.11242272292123218, + "loss": 0.0134, + "num_input_tokens_seen": 21009600, + "step": 23225 + }, + { + "epoch": 6.130922528705292, + "grad_norm": 0.0032880757935345173, + "learning_rate": 0.11236569927659217, + "loss": 0.0524, + "num_input_tokens_seen": 21014176, + "step": 23230 + }, + { + "epoch": 6.132242312260789, + "grad_norm": 0.003237304510548711, + "learning_rate": 0.11230868143563429, + "loss": 0.0577, + "num_input_tokens_seen": 21018816, + "step": 23235 + }, + { + "epoch": 6.133562095816286, + "grad_norm": 0.00291108014062047, + "learning_rate": 0.11225166940715131, + "loss": 0.0665, + "num_input_tokens_seen": 21023520, + "step": 23240 + }, + { + "epoch": 6.134881879371783, + "grad_norm": 0.0009082756005227566, + "learning_rate": 0.11219466319993537, + "loss": 0.0596, + "num_input_tokens_seen": 21028288, + "step": 23245 + }, + { + "epoch": 6.13620166292728, + "grad_norm": 0.0007905740640126169, + "learning_rate": 0.11213766282277739, + "loss": 0.0262, + "num_input_tokens_seen": 21032704, + "step": 23250 + }, + { + "epoch": 6.1375214464827765, + "grad_norm": 0.0012900447472929955, + "learning_rate": 0.11208066828446761, + "loss": 0.0269, + "num_input_tokens_seen": 21037344, + "step": 23255 + }, + { + "epoch": 6.138841230038274, + "grad_norm": 0.00043856920092366636, + "learning_rate": 0.11202367959379537, + "loss": 0.0199, + "num_input_tokens_seen": 21041920, + "step": 23260 + }, + { + "epoch": 6.1401610135937705, + "grad_norm": 0.0004692485381383449, + "learning_rate": 0.11196669675954894, + "loss": 0.0389, + "num_input_tokens_seen": 21046496, + "step": 23265 + }, + { + "epoch": 6.141480797149267, + "grad_norm": 0.0012855029199272394, + "learning_rate": 0.1119097197905158, + "loss": 0.0764, + "num_input_tokens_seen": 21051168, + "step": 23270 + }, + { + "epoch": 6.1428005807047645, + "grad_norm": 0.004095522221177816, + "learning_rate": 0.11185274869548259, + "loss": 0.0441, + "num_input_tokens_seen": 21055296, + "step": 23275 + }, + { + "epoch": 6.144120364260261, + "grad_norm": 0.00017577772086951882, + "learning_rate": 0.11179578348323486, + "loss": 0.0522, + "num_input_tokens_seen": 21059872, + "step": 23280 + }, + { + "epoch": 6.1454401478157585, + "grad_norm": 0.002211606362834573, + "learning_rate": 0.1117388241625575, + "loss": 0.0446, + "num_input_tokens_seen": 21064512, + "step": 23285 + }, + { + "epoch": 6.146759931371255, + "grad_norm": 0.002986730309203267, + "learning_rate": 0.11168187074223421, + "loss": 0.043, + "num_input_tokens_seen": 21068800, + "step": 23290 + }, + { + "epoch": 6.148079714926752, + "grad_norm": 0.0012578495079651475, + "learning_rate": 0.11162492323104796, + "loss": 0.0449, + "num_input_tokens_seen": 21073536, + "step": 23295 + }, + { + "epoch": 6.149399498482249, + "grad_norm": 0.0015735164051875472, + "learning_rate": 0.11156798163778091, + "loss": 0.0333, + "num_input_tokens_seen": 21077888, + "step": 23300 + }, + { + "epoch": 6.150719282037746, + "grad_norm": 0.0016391031676903367, + "learning_rate": 0.11151104597121399, + "loss": 0.0654, + "num_input_tokens_seen": 21082144, + "step": 23305 + }, + { + "epoch": 6.152039065593243, + "grad_norm": 0.0014839692739769816, + "learning_rate": 0.11145411624012742, + "loss": 0.0482, + "num_input_tokens_seen": 21086816, + "step": 23310 + }, + { + "epoch": 6.15335884914874, + "grad_norm": 0.001229755231179297, + "learning_rate": 0.11139719245330063, + "loss": 0.0301, + "num_input_tokens_seen": 21090944, + "step": 23315 + }, + { + "epoch": 6.154678632704236, + "grad_norm": 0.00013179477537050843, + "learning_rate": 0.11134027461951179, + "loss": 0.0422, + "num_input_tokens_seen": 21095680, + "step": 23320 + }, + { + "epoch": 6.155998416259734, + "grad_norm": 0.0027457838878035545, + "learning_rate": 0.11128336274753849, + "loss": 0.0508, + "num_input_tokens_seen": 21100256, + "step": 23325 + }, + { + "epoch": 6.15731819981523, + "grad_norm": 0.0006087552756071091, + "learning_rate": 0.11122645684615715, + "loss": 0.0416, + "num_input_tokens_seen": 21104896, + "step": 23330 + }, + { + "epoch": 6.158637983370728, + "grad_norm": 0.0019505429081618786, + "learning_rate": 0.11116955692414345, + "loss": 0.0424, + "num_input_tokens_seen": 21109760, + "step": 23335 + }, + { + "epoch": 6.159957766926224, + "grad_norm": 0.0005173974786885083, + "learning_rate": 0.11111266299027203, + "loss": 0.0615, + "num_input_tokens_seen": 21113952, + "step": 23340 + }, + { + "epoch": 6.161277550481721, + "grad_norm": 0.0020913437474519014, + "learning_rate": 0.11105577505331668, + "loss": 0.0513, + "num_input_tokens_seen": 21118528, + "step": 23345 + }, + { + "epoch": 6.162597334037218, + "grad_norm": 0.00264470255933702, + "learning_rate": 0.11099889312205018, + "loss": 0.069, + "num_input_tokens_seen": 21123136, + "step": 23350 + }, + { + "epoch": 6.163917117592715, + "grad_norm": 0.00025170171284116805, + "learning_rate": 0.11094201720524455, + "loss": 0.0275, + "num_input_tokens_seen": 21127232, + "step": 23355 + }, + { + "epoch": 6.165236901148211, + "grad_norm": 0.0015484357718378305, + "learning_rate": 0.11088514731167064, + "loss": 0.0491, + "num_input_tokens_seen": 21132000, + "step": 23360 + }, + { + "epoch": 6.166556684703709, + "grad_norm": 0.002369972877204418, + "learning_rate": 0.11082828345009862, + "loss": 0.0907, + "num_input_tokens_seen": 21136608, + "step": 23365 + }, + { + "epoch": 6.167876468259205, + "grad_norm": 0.002129550324752927, + "learning_rate": 0.11077142562929748, + "loss": 0.0304, + "num_input_tokens_seen": 21140896, + "step": 23370 + }, + { + "epoch": 6.169196251814703, + "grad_norm": 0.0036944474559277296, + "learning_rate": 0.11071457385803554, + "loss": 0.0418, + "num_input_tokens_seen": 21145632, + "step": 23375 + }, + { + "epoch": 6.170516035370199, + "grad_norm": 0.0004809470265172422, + "learning_rate": 0.11065772814508001, + "loss": 0.0282, + "num_input_tokens_seen": 21150208, + "step": 23380 + }, + { + "epoch": 6.171835818925696, + "grad_norm": 0.0005954450462013483, + "learning_rate": 0.11060088849919715, + "loss": 0.0593, + "num_input_tokens_seen": 21154592, + "step": 23385 + }, + { + "epoch": 6.173155602481193, + "grad_norm": 0.0031670760363340378, + "learning_rate": 0.11054405492915244, + "loss": 0.0671, + "num_input_tokens_seen": 21159072, + "step": 23390 + }, + { + "epoch": 6.17447538603669, + "grad_norm": 0.003784921020269394, + "learning_rate": 0.11048722744371031, + "loss": 0.062, + "num_input_tokens_seen": 21163424, + "step": 23395 + }, + { + "epoch": 6.175795169592186, + "grad_norm": 0.0012437945697456598, + "learning_rate": 0.1104304060516342, + "loss": 0.0453, + "num_input_tokens_seen": 21167808, + "step": 23400 + }, + { + "epoch": 6.175795169592186, + "eval_loss": 0.08905886113643646, + "eval_runtime": 75.8389, + "eval_samples_per_second": 88.807, + "eval_steps_per_second": 22.205, + "num_input_tokens_seen": 21167808, + "step": 23400 + }, + { + "epoch": 6.177114953147684, + "grad_norm": 0.000596735393628478, + "learning_rate": 0.11037359076168682, + "loss": 0.0276, + "num_input_tokens_seen": 21172256, + "step": 23405 + }, + { + "epoch": 6.17843473670318, + "grad_norm": 0.0006790168699808419, + "learning_rate": 0.11031678158262966, + "loss": 0.0532, + "num_input_tokens_seen": 21176672, + "step": 23410 + }, + { + "epoch": 6.179754520258678, + "grad_norm": 0.0009947441285476089, + "learning_rate": 0.11025997852322349, + "loss": 0.0393, + "num_input_tokens_seen": 21181504, + "step": 23415 + }, + { + "epoch": 6.181074303814174, + "grad_norm": 0.00441000098362565, + "learning_rate": 0.11020318159222807, + "loss": 0.0665, + "num_input_tokens_seen": 21185696, + "step": 23420 + }, + { + "epoch": 6.182394087369671, + "grad_norm": 0.0019650415051728487, + "learning_rate": 0.1101463907984021, + "loss": 0.06, + "num_input_tokens_seen": 21190208, + "step": 23425 + }, + { + "epoch": 6.183713870925168, + "grad_norm": 0.005074616521596909, + "learning_rate": 0.11008960615050352, + "loss": 0.0692, + "num_input_tokens_seen": 21194976, + "step": 23430 + }, + { + "epoch": 6.185033654480665, + "grad_norm": 0.0006065780762583017, + "learning_rate": 0.11003282765728925, + "loss": 0.03, + "num_input_tokens_seen": 21199872, + "step": 23435 + }, + { + "epoch": 6.1863534380361624, + "grad_norm": 0.0009758140658959746, + "learning_rate": 0.10997605532751518, + "loss": 0.0435, + "num_input_tokens_seen": 21204288, + "step": 23440 + }, + { + "epoch": 6.187673221591659, + "grad_norm": 0.0028909947723150253, + "learning_rate": 0.1099192891699364, + "loss": 0.0317, + "num_input_tokens_seen": 21208992, + "step": 23445 + }, + { + "epoch": 6.188993005147156, + "grad_norm": 0.003925338387489319, + "learning_rate": 0.10986252919330687, + "loss": 0.0351, + "num_input_tokens_seen": 21213312, + "step": 23450 + }, + { + "epoch": 6.190312788702653, + "grad_norm": 0.0015099556185305119, + "learning_rate": 0.10980577540637973, + "loss": 0.053, + "num_input_tokens_seen": 21217920, + "step": 23455 + }, + { + "epoch": 6.19163257225815, + "grad_norm": 0.0013178555527701974, + "learning_rate": 0.10974902781790719, + "loss": 0.0277, + "num_input_tokens_seen": 21222560, + "step": 23460 + }, + { + "epoch": 6.192952355813647, + "grad_norm": 0.0030545752961188555, + "learning_rate": 0.10969228643664032, + "loss": 0.0527, + "num_input_tokens_seen": 21227392, + "step": 23465 + }, + { + "epoch": 6.194272139369144, + "grad_norm": 0.003299623029306531, + "learning_rate": 0.10963555127132942, + "loss": 0.0757, + "num_input_tokens_seen": 21231808, + "step": 23470 + }, + { + "epoch": 6.19559192292464, + "grad_norm": 0.002927149645984173, + "learning_rate": 0.10957882233072382, + "loss": 0.0453, + "num_input_tokens_seen": 21236352, + "step": 23475 + }, + { + "epoch": 6.196911706480138, + "grad_norm": 0.0021362202242016792, + "learning_rate": 0.10952209962357176, + "loss": 0.0372, + "num_input_tokens_seen": 21240704, + "step": 23480 + }, + { + "epoch": 6.198231490035634, + "grad_norm": 0.0002790578582789749, + "learning_rate": 0.10946538315862062, + "loss": 0.0188, + "num_input_tokens_seen": 21245088, + "step": 23485 + }, + { + "epoch": 6.199551273591131, + "grad_norm": 0.0012829635525122285, + "learning_rate": 0.10940867294461679, + "loss": 0.0331, + "num_input_tokens_seen": 21249696, + "step": 23490 + }, + { + "epoch": 6.200871057146628, + "grad_norm": 0.0032722828909754753, + "learning_rate": 0.10935196899030565, + "loss": 0.0811, + "num_input_tokens_seen": 21253952, + "step": 23495 + }, + { + "epoch": 6.202190840702125, + "grad_norm": 0.0015553837874904275, + "learning_rate": 0.10929527130443177, + "loss": 0.0426, + "num_input_tokens_seen": 21258464, + "step": 23500 + }, + { + "epoch": 6.203510624257622, + "grad_norm": 0.0022315902169793844, + "learning_rate": 0.1092385798957385, + "loss": 0.0504, + "num_input_tokens_seen": 21263104, + "step": 23505 + }, + { + "epoch": 6.204830407813119, + "grad_norm": 0.0025488603860139847, + "learning_rate": 0.10918189477296848, + "loss": 0.0576, + "num_input_tokens_seen": 21267616, + "step": 23510 + }, + { + "epoch": 6.206150191368615, + "grad_norm": 0.002367062261328101, + "learning_rate": 0.1091252159448633, + "loss": 0.0379, + "num_input_tokens_seen": 21272416, + "step": 23515 + }, + { + "epoch": 6.207469974924113, + "grad_norm": 0.003145731519907713, + "learning_rate": 0.10906854342016345, + "loss": 0.0556, + "num_input_tokens_seen": 21276640, + "step": 23520 + }, + { + "epoch": 6.208789758479609, + "grad_norm": 0.0005125877214595675, + "learning_rate": 0.10901187720760858, + "loss": 0.0258, + "num_input_tokens_seen": 21280960, + "step": 23525 + }, + { + "epoch": 6.210109542035106, + "grad_norm": 0.0022163044195622206, + "learning_rate": 0.10895521731593734, + "loss": 0.0474, + "num_input_tokens_seen": 21285568, + "step": 23530 + }, + { + "epoch": 6.211429325590603, + "grad_norm": 0.003927480895072222, + "learning_rate": 0.10889856375388733, + "loss": 0.0594, + "num_input_tokens_seen": 21289856, + "step": 23535 + }, + { + "epoch": 6.2127491091461, + "grad_norm": 0.0011351245921105146, + "learning_rate": 0.1088419165301954, + "loss": 0.0439, + "num_input_tokens_seen": 21294752, + "step": 23540 + }, + { + "epoch": 6.214068892701597, + "grad_norm": 0.0024218857288360596, + "learning_rate": 0.1087852756535971, + "loss": 0.0386, + "num_input_tokens_seen": 21299168, + "step": 23545 + }, + { + "epoch": 6.215388676257094, + "grad_norm": 0.0038416164461523294, + "learning_rate": 0.10872864113282725, + "loss": 0.0508, + "num_input_tokens_seen": 21303616, + "step": 23550 + }, + { + "epoch": 6.21670845981259, + "grad_norm": 0.0015450188657268882, + "learning_rate": 0.10867201297661958, + "loss": 0.0385, + "num_input_tokens_seen": 21308384, + "step": 23555 + }, + { + "epoch": 6.218028243368088, + "grad_norm": 0.0018420170526951551, + "learning_rate": 0.10861539119370689, + "loss": 0.0425, + "num_input_tokens_seen": 21312704, + "step": 23560 + }, + { + "epoch": 6.219348026923584, + "grad_norm": 0.001528171356767416, + "learning_rate": 0.10855877579282096, + "loss": 0.0373, + "num_input_tokens_seen": 21317280, + "step": 23565 + }, + { + "epoch": 6.220667810479082, + "grad_norm": 0.00256520533002913, + "learning_rate": 0.10850216678269252, + "loss": 0.0562, + "num_input_tokens_seen": 21321664, + "step": 23570 + }, + { + "epoch": 6.221987594034578, + "grad_norm": 0.00027072045486420393, + "learning_rate": 0.10844556417205146, + "loss": 0.0262, + "num_input_tokens_seen": 21326272, + "step": 23575 + }, + { + "epoch": 6.223307377590075, + "grad_norm": 0.0033120487350970507, + "learning_rate": 0.10838896796962669, + "loss": 0.0897, + "num_input_tokens_seen": 21330720, + "step": 23580 + }, + { + "epoch": 6.224627161145572, + "grad_norm": 0.003809854155406356, + "learning_rate": 0.1083323781841459, + "loss": 0.0738, + "num_input_tokens_seen": 21335296, + "step": 23585 + }, + { + "epoch": 6.225946944701069, + "grad_norm": 0.0018285451224073768, + "learning_rate": 0.10827579482433607, + "loss": 0.0668, + "num_input_tokens_seen": 21339936, + "step": 23590 + }, + { + "epoch": 6.227266728256566, + "grad_norm": 0.002441893331706524, + "learning_rate": 0.10821921789892304, + "loss": 0.0367, + "num_input_tokens_seen": 21344608, + "step": 23595 + }, + { + "epoch": 6.228586511812063, + "grad_norm": 0.0008672435651533306, + "learning_rate": 0.10816264741663158, + "loss": 0.0535, + "num_input_tokens_seen": 21349184, + "step": 23600 + }, + { + "epoch": 6.228586511812063, + "eval_loss": 0.09176187217235565, + "eval_runtime": 75.9905, + "eval_samples_per_second": 88.629, + "eval_steps_per_second": 22.161, + "num_input_tokens_seen": 21349184, + "step": 23600 + }, + { + "epoch": 6.2299062953675595, + "grad_norm": 0.0004951859009452164, + "learning_rate": 0.10810608338618573, + "loss": 0.0444, + "num_input_tokens_seen": 21353632, + "step": 23605 + }, + { + "epoch": 6.231226078923057, + "grad_norm": 0.0033590570092201233, + "learning_rate": 0.10804952581630821, + "loss": 0.0472, + "num_input_tokens_seen": 21358400, + "step": 23610 + }, + { + "epoch": 6.2325458624785535, + "grad_norm": 0.0007746741757728159, + "learning_rate": 0.10799297471572102, + "loss": 0.0438, + "num_input_tokens_seen": 21362848, + "step": 23615 + }, + { + "epoch": 6.23386564603405, + "grad_norm": 0.000758139998652041, + "learning_rate": 0.10793643009314507, + "loss": 0.0335, + "num_input_tokens_seen": 21367648, + "step": 23620 + }, + { + "epoch": 6.2351854295895475, + "grad_norm": 0.0007904049707576632, + "learning_rate": 0.10787989195730015, + "loss": 0.0306, + "num_input_tokens_seen": 21372096, + "step": 23625 + }, + { + "epoch": 6.236505213145044, + "grad_norm": 0.0011466526193544269, + "learning_rate": 0.10782336031690525, + "loss": 0.058, + "num_input_tokens_seen": 21376480, + "step": 23630 + }, + { + "epoch": 6.2378249967005415, + "grad_norm": 0.0023672215174883604, + "learning_rate": 0.10776683518067821, + "loss": 0.0574, + "num_input_tokens_seen": 21381152, + "step": 23635 + }, + { + "epoch": 6.239144780256038, + "grad_norm": 0.004928221460431814, + "learning_rate": 0.10771031655733587, + "loss": 0.0531, + "num_input_tokens_seen": 21385568, + "step": 23640 + }, + { + "epoch": 6.240464563811535, + "grad_norm": 0.000377417920390144, + "learning_rate": 0.10765380445559422, + "loss": 0.0167, + "num_input_tokens_seen": 21390208, + "step": 23645 + }, + { + "epoch": 6.241784347367032, + "grad_norm": 0.0016388630028814077, + "learning_rate": 0.10759729888416801, + "loss": 0.0646, + "num_input_tokens_seen": 21394688, + "step": 23650 + }, + { + "epoch": 6.243104130922529, + "grad_norm": 0.0016155876219272614, + "learning_rate": 0.10754079985177119, + "loss": 0.0456, + "num_input_tokens_seen": 21399296, + "step": 23655 + }, + { + "epoch": 6.244423914478026, + "grad_norm": 0.00419123750180006, + "learning_rate": 0.10748430736711667, + "loss": 0.1216, + "num_input_tokens_seen": 21404096, + "step": 23660 + }, + { + "epoch": 6.245743698033523, + "grad_norm": 0.002528657903894782, + "learning_rate": 0.10742782143891623, + "loss": 0.0351, + "num_input_tokens_seen": 21408640, + "step": 23665 + }, + { + "epoch": 6.247063481589019, + "grad_norm": 0.002005211776122451, + "learning_rate": 0.10737134207588069, + "loss": 0.0367, + "num_input_tokens_seen": 21412992, + "step": 23670 + }, + { + "epoch": 6.248383265144517, + "grad_norm": 0.0030990035738795996, + "learning_rate": 0.10731486928671992, + "loss": 0.0447, + "num_input_tokens_seen": 21417184, + "step": 23675 + }, + { + "epoch": 6.249703048700013, + "grad_norm": 0.0038860614877194166, + "learning_rate": 0.10725840308014269, + "loss": 0.0929, + "num_input_tokens_seen": 21421408, + "step": 23680 + }, + { + "epoch": 6.25102283225551, + "grad_norm": 0.0006170970154926181, + "learning_rate": 0.10720194346485688, + "loss": 0.0558, + "num_input_tokens_seen": 21425696, + "step": 23685 + }, + { + "epoch": 6.252342615811007, + "grad_norm": 0.0019431666005402803, + "learning_rate": 0.10714549044956918, + "loss": 0.0231, + "num_input_tokens_seen": 21430368, + "step": 23690 + }, + { + "epoch": 6.253662399366504, + "grad_norm": 0.0019702883437275887, + "learning_rate": 0.10708904404298542, + "loss": 0.0475, + "num_input_tokens_seen": 21434656, + "step": 23695 + }, + { + "epoch": 6.254982182922001, + "grad_norm": 0.0006294108461588621, + "learning_rate": 0.1070326042538103, + "loss": 0.0247, + "num_input_tokens_seen": 21439104, + "step": 23700 + }, + { + "epoch": 6.256301966477498, + "grad_norm": 0.000548355805221945, + "learning_rate": 0.10697617109074758, + "loss": 0.033, + "num_input_tokens_seen": 21443744, + "step": 23705 + }, + { + "epoch": 6.257621750032994, + "grad_norm": 0.004131329711526632, + "learning_rate": 0.10691974456249999, + "loss": 0.0614, + "num_input_tokens_seen": 21448352, + "step": 23710 + }, + { + "epoch": 6.258941533588492, + "grad_norm": 0.0016498406184837222, + "learning_rate": 0.10686332467776909, + "loss": 0.0436, + "num_input_tokens_seen": 21452768, + "step": 23715 + }, + { + "epoch": 6.260261317143988, + "grad_norm": 0.0014972806675359607, + "learning_rate": 0.10680691144525563, + "loss": 0.0537, + "num_input_tokens_seen": 21457152, + "step": 23720 + }, + { + "epoch": 6.261581100699486, + "grad_norm": 0.003703376743942499, + "learning_rate": 0.10675050487365928, + "loss": 0.0596, + "num_input_tokens_seen": 21461280, + "step": 23725 + }, + { + "epoch": 6.262900884254982, + "grad_norm": 0.0037266737781465054, + "learning_rate": 0.10669410497167851, + "loss": 0.0691, + "num_input_tokens_seen": 21465600, + "step": 23730 + }, + { + "epoch": 6.264220667810479, + "grad_norm": 0.004408944398164749, + "learning_rate": 0.10663771174801102, + "loss": 0.0995, + "num_input_tokens_seen": 21470016, + "step": 23735 + }, + { + "epoch": 6.265540451365976, + "grad_norm": 0.0037519228644669056, + "learning_rate": 0.10658132521135329, + "loss": 0.0658, + "num_input_tokens_seen": 21474752, + "step": 23740 + }, + { + "epoch": 6.266860234921473, + "grad_norm": 0.0019889913965016603, + "learning_rate": 0.10652494537040084, + "loss": 0.0414, + "num_input_tokens_seen": 21479392, + "step": 23745 + }, + { + "epoch": 6.268180018476969, + "grad_norm": 0.0007012595306150615, + "learning_rate": 0.1064685722338482, + "loss": 0.0482, + "num_input_tokens_seen": 21483840, + "step": 23750 + }, + { + "epoch": 6.269499802032467, + "grad_norm": 0.0004499455390032381, + "learning_rate": 0.10641220581038871, + "loss": 0.0173, + "num_input_tokens_seen": 21488256, + "step": 23755 + }, + { + "epoch": 6.270819585587963, + "grad_norm": 0.003525102511048317, + "learning_rate": 0.10635584610871483, + "loss": 0.047, + "num_input_tokens_seen": 21492992, + "step": 23760 + }, + { + "epoch": 6.272139369143461, + "grad_norm": 0.0004929761635139585, + "learning_rate": 0.10629949313751803, + "loss": 0.0477, + "num_input_tokens_seen": 21497728, + "step": 23765 + }, + { + "epoch": 6.273459152698957, + "grad_norm": 0.002406513085588813, + "learning_rate": 0.10624314690548849, + "loss": 0.0576, + "num_input_tokens_seen": 21502432, + "step": 23770 + }, + { + "epoch": 6.274778936254454, + "grad_norm": 0.0006044461042620242, + "learning_rate": 0.1061868074213156, + "loss": 0.0337, + "num_input_tokens_seen": 21507008, + "step": 23775 + }, + { + "epoch": 6.276098719809951, + "grad_norm": 0.0006206091493368149, + "learning_rate": 0.10613047469368765, + "loss": 0.0275, + "num_input_tokens_seen": 21511424, + "step": 23780 + }, + { + "epoch": 6.277418503365448, + "grad_norm": 0.004379145801067352, + "learning_rate": 0.10607414873129171, + "loss": 0.037, + "num_input_tokens_seen": 21515808, + "step": 23785 + }, + { + "epoch": 6.278738286920945, + "grad_norm": 0.00022266509768087417, + "learning_rate": 0.10601782954281413, + "loss": 0.0298, + "num_input_tokens_seen": 21520320, + "step": 23790 + }, + { + "epoch": 6.280058070476442, + "grad_norm": 0.002262836555019021, + "learning_rate": 0.1059615171369399, + "loss": 0.0349, + "num_input_tokens_seen": 21524800, + "step": 23795 + }, + { + "epoch": 6.2813778540319385, + "grad_norm": 0.002843257039785385, + "learning_rate": 0.10590521152235312, + "loss": 0.0355, + "num_input_tokens_seen": 21529376, + "step": 23800 + }, + { + "epoch": 6.2813778540319385, + "eval_loss": 0.09450829029083252, + "eval_runtime": 75.8277, + "eval_samples_per_second": 88.82, + "eval_steps_per_second": 22.208, + "num_input_tokens_seen": 21529376, + "step": 23800 + }, + { + "epoch": 6.282697637587436, + "grad_norm": 0.0023358738981187344, + "learning_rate": 0.1058489127077369, + "loss": 0.0832, + "num_input_tokens_seen": 21533856, + "step": 23805 + }, + { + "epoch": 6.2840174211429325, + "grad_norm": 0.001038072514347732, + "learning_rate": 0.1057926207017732, + "loss": 0.0603, + "num_input_tokens_seen": 21538400, + "step": 23810 + }, + { + "epoch": 6.285337204698429, + "grad_norm": 0.004414190072566271, + "learning_rate": 0.10573633551314285, + "loss": 0.0655, + "num_input_tokens_seen": 21542976, + "step": 23815 + }, + { + "epoch": 6.2866569882539265, + "grad_norm": 0.0011593776289373636, + "learning_rate": 0.1056800571505259, + "loss": 0.0373, + "num_input_tokens_seen": 21547776, + "step": 23820 + }, + { + "epoch": 6.287976771809423, + "grad_norm": 0.004129400942474604, + "learning_rate": 0.10562378562260105, + "loss": 0.0427, + "num_input_tokens_seen": 21552320, + "step": 23825 + }, + { + "epoch": 6.2892965553649205, + "grad_norm": 0.0008213675464503467, + "learning_rate": 0.10556752093804615, + "loss": 0.037, + "num_input_tokens_seen": 21556832, + "step": 23830 + }, + { + "epoch": 6.290616338920417, + "grad_norm": 0.002551118843257427, + "learning_rate": 0.10551126310553786, + "loss": 0.0564, + "num_input_tokens_seen": 21561472, + "step": 23835 + }, + { + "epoch": 6.291936122475914, + "grad_norm": 0.0029281671158969402, + "learning_rate": 0.10545501213375187, + "loss": 0.0384, + "num_input_tokens_seen": 21566080, + "step": 23840 + }, + { + "epoch": 6.293255906031411, + "grad_norm": 0.0013243198627606034, + "learning_rate": 0.10539876803136287, + "loss": 0.0541, + "num_input_tokens_seen": 21570656, + "step": 23845 + }, + { + "epoch": 6.294575689586908, + "grad_norm": 0.0022611322347074747, + "learning_rate": 0.10534253080704428, + "loss": 0.0366, + "num_input_tokens_seen": 21575072, + "step": 23850 + }, + { + "epoch": 6.295895473142405, + "grad_norm": 0.0017783178482204676, + "learning_rate": 0.10528630046946862, + "loss": 0.0514, + "num_input_tokens_seen": 21579680, + "step": 23855 + }, + { + "epoch": 6.297215256697902, + "grad_norm": 0.00486551271751523, + "learning_rate": 0.1052300770273074, + "loss": 0.0534, + "num_input_tokens_seen": 21584448, + "step": 23860 + }, + { + "epoch": 6.298535040253398, + "grad_norm": 0.0031627484131604433, + "learning_rate": 0.10517386048923086, + "loss": 0.057, + "num_input_tokens_seen": 21588928, + "step": 23865 + }, + { + "epoch": 6.299854823808896, + "grad_norm": 0.0018293441971763968, + "learning_rate": 0.10511765086390841, + "loss": 0.0681, + "num_input_tokens_seen": 21593408, + "step": 23870 + }, + { + "epoch": 6.301174607364392, + "grad_norm": 0.0018550140084698796, + "learning_rate": 0.10506144816000816, + "loss": 0.039, + "num_input_tokens_seen": 21597920, + "step": 23875 + }, + { + "epoch": 6.302494390919889, + "grad_norm": 0.0052023353055119514, + "learning_rate": 0.10500525238619736, + "loss": 0.0873, + "num_input_tokens_seen": 21602496, + "step": 23880 + }, + { + "epoch": 6.303814174475386, + "grad_norm": 0.0013900838093832135, + "learning_rate": 0.10494906355114209, + "loss": 0.0356, + "num_input_tokens_seen": 21607136, + "step": 23885 + }, + { + "epoch": 6.305133958030883, + "grad_norm": 0.0014900839887559414, + "learning_rate": 0.10489288166350737, + "loss": 0.0358, + "num_input_tokens_seen": 21611904, + "step": 23890 + }, + { + "epoch": 6.30645374158638, + "grad_norm": 0.001463708933442831, + "learning_rate": 0.10483670673195711, + "loss": 0.0432, + "num_input_tokens_seen": 21616224, + "step": 23895 + }, + { + "epoch": 6.307773525141877, + "grad_norm": 0.0013828485971316695, + "learning_rate": 0.10478053876515431, + "loss": 0.0231, + "num_input_tokens_seen": 21620672, + "step": 23900 + }, + { + "epoch": 6.309093308697373, + "grad_norm": 0.000992145505733788, + "learning_rate": 0.10472437777176061, + "loss": 0.035, + "num_input_tokens_seen": 21625376, + "step": 23905 + }, + { + "epoch": 6.310413092252871, + "grad_norm": 0.001748527749441564, + "learning_rate": 0.1046682237604369, + "loss": 0.0465, + "num_input_tokens_seen": 21629568, + "step": 23910 + }, + { + "epoch": 6.311732875808367, + "grad_norm": 0.0017083758721128106, + "learning_rate": 0.1046120767398427, + "loss": 0.0352, + "num_input_tokens_seen": 21634016, + "step": 23915 + }, + { + "epoch": 6.313052659363865, + "grad_norm": 0.0021609123796224594, + "learning_rate": 0.10455593671863667, + "loss": 0.0307, + "num_input_tokens_seen": 21638496, + "step": 23920 + }, + { + "epoch": 6.314372442919361, + "grad_norm": 0.00285804970189929, + "learning_rate": 0.1044998037054763, + "loss": 0.1128, + "num_input_tokens_seen": 21642944, + "step": 23925 + }, + { + "epoch": 6.315692226474858, + "grad_norm": 0.0027233087457716465, + "learning_rate": 0.10444367770901794, + "loss": 0.0737, + "num_input_tokens_seen": 21647744, + "step": 23930 + }, + { + "epoch": 6.317012010030355, + "grad_norm": 0.0009931487729772925, + "learning_rate": 0.10438755873791698, + "loss": 0.0272, + "num_input_tokens_seen": 21652128, + "step": 23935 + }, + { + "epoch": 6.318331793585852, + "grad_norm": 0.0018094569677487016, + "learning_rate": 0.10433144680082775, + "loss": 0.0545, + "num_input_tokens_seen": 21656672, + "step": 23940 + }, + { + "epoch": 6.319651577141348, + "grad_norm": 0.0014382570516318083, + "learning_rate": 0.10427534190640322, + "loss": 0.0332, + "num_input_tokens_seen": 21661408, + "step": 23945 + }, + { + "epoch": 6.320971360696846, + "grad_norm": 0.001743443077430129, + "learning_rate": 0.10421924406329568, + "loss": 0.0379, + "num_input_tokens_seen": 21665664, + "step": 23950 + }, + { + "epoch": 6.322291144252342, + "grad_norm": 0.0024086381308734417, + "learning_rate": 0.10416315328015598, + "loss": 0.0385, + "num_input_tokens_seen": 21669888, + "step": 23955 + }, + { + "epoch": 6.32361092780784, + "grad_norm": 0.000922562787309289, + "learning_rate": 0.10410706956563402, + "loss": 0.0294, + "num_input_tokens_seen": 21674560, + "step": 23960 + }, + { + "epoch": 6.324930711363336, + "grad_norm": 0.0034454669803380966, + "learning_rate": 0.10405099292837874, + "loss": 0.0366, + "num_input_tokens_seen": 21678912, + "step": 23965 + }, + { + "epoch": 6.326250494918833, + "grad_norm": 0.003165449481457472, + "learning_rate": 0.10399492337703771, + "loss": 0.0338, + "num_input_tokens_seen": 21683392, + "step": 23970 + }, + { + "epoch": 6.32757027847433, + "grad_norm": 0.004478081129491329, + "learning_rate": 0.10393886092025764, + "loss": 0.0619, + "num_input_tokens_seen": 21688224, + "step": 23975 + }, + { + "epoch": 6.328890062029827, + "grad_norm": 0.00632651848718524, + "learning_rate": 0.10388280556668412, + "loss": 0.0757, + "num_input_tokens_seen": 21692576, + "step": 23980 + }, + { + "epoch": 6.330209845585324, + "grad_norm": 0.003771708346903324, + "learning_rate": 0.10382675732496145, + "loss": 0.0526, + "num_input_tokens_seen": 21697088, + "step": 23985 + }, + { + "epoch": 6.331529629140821, + "grad_norm": 0.00018556749273557216, + "learning_rate": 0.10377071620373311, + "loss": 0.0358, + "num_input_tokens_seen": 21701568, + "step": 23990 + }, + { + "epoch": 6.3328494126963175, + "grad_norm": 0.0021493027452379465, + "learning_rate": 0.10371468221164128, + "loss": 0.1016, + "num_input_tokens_seen": 21705920, + "step": 23995 + }, + { + "epoch": 6.334169196251815, + "grad_norm": 0.0014298318419605494, + "learning_rate": 0.10365865535732706, + "loss": 0.018, + "num_input_tokens_seen": 21710304, + "step": 24000 + }, + { + "epoch": 6.334169196251815, + "eval_loss": 0.09202361851930618, + "eval_runtime": 75.8925, + "eval_samples_per_second": 88.744, + "eval_steps_per_second": 22.189, + "num_input_tokens_seen": 21710304, + "step": 24000 + }, + { + "epoch": 6.3354889798073115, + "grad_norm": 0.0019593278411775827, + "learning_rate": 0.10360263564943062, + "loss": 0.1196, + "num_input_tokens_seen": 21715328, + "step": 24005 + }, + { + "epoch": 6.336808763362809, + "grad_norm": 0.000747001264244318, + "learning_rate": 0.10354662309659075, + "loss": 0.0377, + "num_input_tokens_seen": 21719840, + "step": 24010 + }, + { + "epoch": 6.3381285469183055, + "grad_norm": 0.008329817093908787, + "learning_rate": 0.10349061770744537, + "loss": 0.0661, + "num_input_tokens_seen": 21724480, + "step": 24015 + }, + { + "epoch": 6.339448330473802, + "grad_norm": 0.0016959970816969872, + "learning_rate": 0.10343461949063128, + "loss": 0.032, + "num_input_tokens_seen": 21729088, + "step": 24020 + }, + { + "epoch": 6.3407681140292995, + "grad_norm": 0.0036465171724557877, + "learning_rate": 0.103378628454784, + "loss": 0.0711, + "num_input_tokens_seen": 21733472, + "step": 24025 + }, + { + "epoch": 6.342087897584796, + "grad_norm": 0.000750492443330586, + "learning_rate": 0.10332264460853811, + "loss": 0.0291, + "num_input_tokens_seen": 21738336, + "step": 24030 + }, + { + "epoch": 6.343407681140293, + "grad_norm": 0.004315256606787443, + "learning_rate": 0.10326666796052701, + "loss": 0.0456, + "num_input_tokens_seen": 21742816, + "step": 24035 + }, + { + "epoch": 6.34472746469579, + "grad_norm": 0.0008032612968236208, + "learning_rate": 0.10321069851938296, + "loss": 0.0867, + "num_input_tokens_seen": 21747456, + "step": 24040 + }, + { + "epoch": 6.346047248251287, + "grad_norm": 0.0030300705693662167, + "learning_rate": 0.10315473629373724, + "loss": 0.0533, + "num_input_tokens_seen": 21752032, + "step": 24045 + }, + { + "epoch": 6.347367031806784, + "grad_norm": 0.006027003284543753, + "learning_rate": 0.10309878129221982, + "loss": 0.0332, + "num_input_tokens_seen": 21756512, + "step": 24050 + }, + { + "epoch": 6.348686815362281, + "grad_norm": 0.0008871728787198663, + "learning_rate": 0.10304283352345973, + "loss": 0.0576, + "num_input_tokens_seen": 21760864, + "step": 24055 + }, + { + "epoch": 6.350006598917777, + "grad_norm": 0.0024134295526891947, + "learning_rate": 0.10298689299608486, + "loss": 0.0559, + "num_input_tokens_seen": 21765376, + "step": 24060 + }, + { + "epoch": 6.351326382473275, + "grad_norm": 0.0026379157789051533, + "learning_rate": 0.10293095971872188, + "loss": 0.0638, + "num_input_tokens_seen": 21769824, + "step": 24065 + }, + { + "epoch": 6.352646166028771, + "grad_norm": 0.0012642535148188472, + "learning_rate": 0.10287503369999645, + "loss": 0.0394, + "num_input_tokens_seen": 21774400, + "step": 24070 + }, + { + "epoch": 6.353965949584268, + "grad_norm": 0.0010729189962148666, + "learning_rate": 0.10281911494853295, + "loss": 0.0318, + "num_input_tokens_seen": 21778464, + "step": 24075 + }, + { + "epoch": 6.355285733139765, + "grad_norm": 0.002903244225308299, + "learning_rate": 0.10276320347295485, + "loss": 0.0484, + "num_input_tokens_seen": 21782976, + "step": 24080 + }, + { + "epoch": 6.356605516695262, + "grad_norm": 0.0013584946282207966, + "learning_rate": 0.10270729928188446, + "loss": 0.0699, + "num_input_tokens_seen": 21787776, + "step": 24085 + }, + { + "epoch": 6.357925300250759, + "grad_norm": 0.0012567901285365224, + "learning_rate": 0.10265140238394276, + "loss": 0.0698, + "num_input_tokens_seen": 21792352, + "step": 24090 + }, + { + "epoch": 6.359245083806256, + "grad_norm": 0.0016925487434491515, + "learning_rate": 0.10259551278774988, + "loss": 0.0578, + "num_input_tokens_seen": 21797088, + "step": 24095 + }, + { + "epoch": 6.360564867361752, + "grad_norm": 0.0009286084095947444, + "learning_rate": 0.10253963050192462, + "loss": 0.0303, + "num_input_tokens_seen": 21801536, + "step": 24100 + }, + { + "epoch": 6.36188465091725, + "grad_norm": 0.00022825451742392033, + "learning_rate": 0.10248375553508478, + "loss": 0.0402, + "num_input_tokens_seen": 21806176, + "step": 24105 + }, + { + "epoch": 6.363204434472746, + "grad_norm": 0.002956433454528451, + "learning_rate": 0.102427887895847, + "loss": 0.0751, + "num_input_tokens_seen": 21810304, + "step": 24110 + }, + { + "epoch": 6.364524218028244, + "grad_norm": 0.0007039654883556068, + "learning_rate": 0.10237202759282668, + "loss": 0.0551, + "num_input_tokens_seen": 21814720, + "step": 24115 + }, + { + "epoch": 6.36584400158374, + "grad_norm": 0.002112842397764325, + "learning_rate": 0.10231617463463821, + "loss": 0.0689, + "num_input_tokens_seen": 21819424, + "step": 24120 + }, + { + "epoch": 6.367163785139237, + "grad_norm": 0.002551097422838211, + "learning_rate": 0.10226032902989492, + "loss": 0.0913, + "num_input_tokens_seen": 21823872, + "step": 24125 + }, + { + "epoch": 6.368483568694734, + "grad_norm": 0.002646900247782469, + "learning_rate": 0.10220449078720877, + "loss": 0.0292, + "num_input_tokens_seen": 21828416, + "step": 24130 + }, + { + "epoch": 6.369803352250231, + "grad_norm": 0.002057712757959962, + "learning_rate": 0.1021486599151908, + "loss": 0.0783, + "num_input_tokens_seen": 21833184, + "step": 24135 + }, + { + "epoch": 6.371123135805728, + "grad_norm": 0.0016101557994261384, + "learning_rate": 0.10209283642245084, + "loss": 0.0594, + "num_input_tokens_seen": 21837472, + "step": 24140 + }, + { + "epoch": 6.372442919361225, + "grad_norm": 0.002372507005929947, + "learning_rate": 0.10203702031759748, + "loss": 0.0707, + "num_input_tokens_seen": 21841920, + "step": 24145 + }, + { + "epoch": 6.373762702916721, + "grad_norm": 0.0032145229633897543, + "learning_rate": 0.1019812116092384, + "loss": 0.0745, + "num_input_tokens_seen": 21846176, + "step": 24150 + }, + { + "epoch": 6.375082486472219, + "grad_norm": 0.0027485089376568794, + "learning_rate": 0.10192541030597986, + "loss": 0.0364, + "num_input_tokens_seen": 21850368, + "step": 24155 + }, + { + "epoch": 6.376402270027715, + "grad_norm": 0.0015365503495559096, + "learning_rate": 0.1018696164164272, + "loss": 0.0831, + "num_input_tokens_seen": 21854784, + "step": 24160 + }, + { + "epoch": 6.377722053583212, + "grad_norm": 0.001819427008740604, + "learning_rate": 0.10181382994918459, + "loss": 0.0382, + "num_input_tokens_seen": 21859072, + "step": 24165 + }, + { + "epoch": 6.379041837138709, + "grad_norm": 0.0013496143510565162, + "learning_rate": 0.10175805091285492, + "loss": 0.0649, + "num_input_tokens_seen": 21863488, + "step": 24170 + }, + { + "epoch": 6.380361620694206, + "grad_norm": 0.002926003420725465, + "learning_rate": 0.10170227931603999, + "loss": 0.0525, + "num_input_tokens_seen": 21867776, + "step": 24175 + }, + { + "epoch": 6.381681404249703, + "grad_norm": 0.0032727764919400215, + "learning_rate": 0.10164651516734062, + "loss": 0.0569, + "num_input_tokens_seen": 21872256, + "step": 24180 + }, + { + "epoch": 6.3830011878052, + "grad_norm": 0.00194134924095124, + "learning_rate": 0.1015907584753562, + "loss": 0.0642, + "num_input_tokens_seen": 21876928, + "step": 24185 + }, + { + "epoch": 6.3843209713606965, + "grad_norm": 0.0007618240197189152, + "learning_rate": 0.10153500924868523, + "loss": 0.0379, + "num_input_tokens_seen": 21881216, + "step": 24190 + }, + { + "epoch": 6.385640754916194, + "grad_norm": 0.0024072923697531223, + "learning_rate": 0.10147926749592483, + "loss": 0.0668, + "num_input_tokens_seen": 21885344, + "step": 24195 + }, + { + "epoch": 6.3869605384716905, + "grad_norm": 0.0022632861509919167, + "learning_rate": 0.10142353322567112, + "loss": 0.0335, + "num_input_tokens_seen": 21889920, + "step": 24200 + }, + { + "epoch": 6.3869605384716905, + "eval_loss": 0.09064734727144241, + "eval_runtime": 75.9677, + "eval_samples_per_second": 88.656, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 21889920, + "step": 24200 + }, + { + "epoch": 6.388280322027187, + "grad_norm": 0.0018723364919424057, + "learning_rate": 0.1013678064465191, + "loss": 0.0322, + "num_input_tokens_seen": 21894752, + "step": 24205 + }, + { + "epoch": 6.3896001055826845, + "grad_norm": 0.006068951450288296, + "learning_rate": 0.10131208716706244, + "loss": 0.0761, + "num_input_tokens_seen": 21899168, + "step": 24210 + }, + { + "epoch": 6.390919889138181, + "grad_norm": 0.002954683732241392, + "learning_rate": 0.10125637539589379, + "loss": 0.0609, + "num_input_tokens_seen": 21904000, + "step": 24215 + }, + { + "epoch": 6.3922396726936785, + "grad_norm": 0.0023336191661655903, + "learning_rate": 0.10120067114160464, + "loss": 0.0452, + "num_input_tokens_seen": 21908192, + "step": 24220 + }, + { + "epoch": 6.393559456249175, + "grad_norm": 0.00171061628498137, + "learning_rate": 0.10114497441278517, + "loss": 0.0532, + "num_input_tokens_seen": 21912608, + "step": 24225 + }, + { + "epoch": 6.394879239804672, + "grad_norm": 0.0017681996105238795, + "learning_rate": 0.10108928521802468, + "loss": 0.0655, + "num_input_tokens_seen": 21917216, + "step": 24230 + }, + { + "epoch": 6.396199023360169, + "grad_norm": 0.002471918473020196, + "learning_rate": 0.101033603565911, + "loss": 0.0559, + "num_input_tokens_seen": 21921632, + "step": 24235 + }, + { + "epoch": 6.397518806915666, + "grad_norm": 0.004311477765440941, + "learning_rate": 0.10097792946503102, + "loss": 0.0706, + "num_input_tokens_seen": 21926144, + "step": 24240 + }, + { + "epoch": 6.398838590471163, + "grad_norm": 0.005350543651729822, + "learning_rate": 0.10092226292397039, + "loss": 0.084, + "num_input_tokens_seen": 21930912, + "step": 24245 + }, + { + "epoch": 6.40015837402666, + "grad_norm": 0.0022943352814763784, + "learning_rate": 0.10086660395131354, + "loss": 0.0704, + "num_input_tokens_seen": 21935584, + "step": 24250 + }, + { + "epoch": 6.401478157582156, + "grad_norm": 0.0014682278269901872, + "learning_rate": 0.10081095255564385, + "loss": 0.0561, + "num_input_tokens_seen": 21939968, + "step": 24255 + }, + { + "epoch": 6.402797941137654, + "grad_norm": 0.0017435626359656453, + "learning_rate": 0.10075530874554335, + "loss": 0.0491, + "num_input_tokens_seen": 21944448, + "step": 24260 + }, + { + "epoch": 6.40411772469315, + "grad_norm": 0.0018570804968476295, + "learning_rate": 0.10069967252959311, + "loss": 0.046, + "num_input_tokens_seen": 21948960, + "step": 24265 + }, + { + "epoch": 6.405437508248648, + "grad_norm": 0.001048391219228506, + "learning_rate": 0.10064404391637297, + "loss": 0.0508, + "num_input_tokens_seen": 21953184, + "step": 24270 + }, + { + "epoch": 6.406757291804144, + "grad_norm": 0.0018696564948186278, + "learning_rate": 0.10058842291446145, + "loss": 0.0424, + "num_input_tokens_seen": 21957312, + "step": 24275 + }, + { + "epoch": 6.408077075359641, + "grad_norm": 0.0005112666403874755, + "learning_rate": 0.10053280953243608, + "loss": 0.0516, + "num_input_tokens_seen": 21961824, + "step": 24280 + }, + { + "epoch": 6.409396858915138, + "grad_norm": 0.0020189459901303053, + "learning_rate": 0.10047720377887315, + "loss": 0.0355, + "num_input_tokens_seen": 21966080, + "step": 24285 + }, + { + "epoch": 6.410716642470635, + "grad_norm": 0.0013746226904913783, + "learning_rate": 0.10042160566234767, + "loss": 0.0313, + "num_input_tokens_seen": 21970304, + "step": 24290 + }, + { + "epoch": 6.412036426026131, + "grad_norm": 0.002382970415055752, + "learning_rate": 0.10036601519143372, + "loss": 0.041, + "num_input_tokens_seen": 21974496, + "step": 24295 + }, + { + "epoch": 6.413356209581629, + "grad_norm": 0.0010543913813307881, + "learning_rate": 0.1003104323747039, + "loss": 0.0459, + "num_input_tokens_seen": 21979008, + "step": 24300 + }, + { + "epoch": 6.414675993137125, + "grad_norm": 0.002266896888613701, + "learning_rate": 0.10025485722072984, + "loss": 0.0806, + "num_input_tokens_seen": 21983392, + "step": 24305 + }, + { + "epoch": 6.415995776692623, + "grad_norm": 0.0016676454106345773, + "learning_rate": 0.10019928973808201, + "loss": 0.0517, + "num_input_tokens_seen": 21988320, + "step": 24310 + }, + { + "epoch": 6.417315560248119, + "grad_norm": 0.002088225679472089, + "learning_rate": 0.10014372993532945, + "loss": 0.032, + "num_input_tokens_seen": 21992832, + "step": 24315 + }, + { + "epoch": 6.418635343803616, + "grad_norm": 0.004127246793359518, + "learning_rate": 0.1000881778210403, + "loss": 0.0628, + "num_input_tokens_seen": 21997376, + "step": 24320 + }, + { + "epoch": 6.419955127359113, + "grad_norm": 0.005225596949458122, + "learning_rate": 0.10003263340378142, + "loss": 0.0765, + "num_input_tokens_seen": 22001984, + "step": 24325 + }, + { + "epoch": 6.42127491091461, + "grad_norm": 0.0023108678869903088, + "learning_rate": 0.09997709669211834, + "loss": 0.0403, + "num_input_tokens_seen": 22006368, + "step": 24330 + }, + { + "epoch": 6.4225946944701064, + "grad_norm": 0.0010010561672970653, + "learning_rate": 0.0999215676946156, + "loss": 0.0588, + "num_input_tokens_seen": 22010912, + "step": 24335 + }, + { + "epoch": 6.423914478025604, + "grad_norm": 0.0037382475566118956, + "learning_rate": 0.0998660464198364, + "loss": 0.0495, + "num_input_tokens_seen": 22015360, + "step": 24340 + }, + { + "epoch": 6.4252342615811004, + "grad_norm": 0.002338243881240487, + "learning_rate": 0.09981053287634288, + "loss": 0.0181, + "num_input_tokens_seen": 22020064, + "step": 24345 + }, + { + "epoch": 6.426554045136598, + "grad_norm": 0.0013228924944996834, + "learning_rate": 0.09975502707269596, + "loss": 0.0344, + "num_input_tokens_seen": 22024736, + "step": 24350 + }, + { + "epoch": 6.4278738286920944, + "grad_norm": 0.0017264228081330657, + "learning_rate": 0.09969952901745524, + "loss": 0.038, + "num_input_tokens_seen": 22029472, + "step": 24355 + }, + { + "epoch": 6.429193612247591, + "grad_norm": 0.002595573430880904, + "learning_rate": 0.09964403871917925, + "loss": 0.049, + "num_input_tokens_seen": 22033920, + "step": 24360 + }, + { + "epoch": 6.4305133958030885, + "grad_norm": 0.0005027906154282391, + "learning_rate": 0.09958855618642536, + "loss": 0.0418, + "num_input_tokens_seen": 22038464, + "step": 24365 + }, + { + "epoch": 6.431833179358585, + "grad_norm": 0.0015372387133538723, + "learning_rate": 0.09953308142774955, + "loss": 0.0439, + "num_input_tokens_seen": 22043040, + "step": 24370 + }, + { + "epoch": 6.4331529629140825, + "grad_norm": 0.0018927608616650105, + "learning_rate": 0.09947761445170686, + "loss": 0.0645, + "num_input_tokens_seen": 22047552, + "step": 24375 + }, + { + "epoch": 6.434472746469579, + "grad_norm": 0.004471888300031424, + "learning_rate": 0.09942215526685086, + "loss": 0.0677, + "num_input_tokens_seen": 22052160, + "step": 24380 + }, + { + "epoch": 6.435792530025076, + "grad_norm": 0.0023930442985147238, + "learning_rate": 0.09936670388173414, + "loss": 0.0578, + "num_input_tokens_seen": 22056640, + "step": 24385 + }, + { + "epoch": 6.437112313580573, + "grad_norm": 0.003010789630934596, + "learning_rate": 0.09931126030490799, + "loss": 0.0603, + "num_input_tokens_seen": 22061216, + "step": 24390 + }, + { + "epoch": 6.43843209713607, + "grad_norm": 0.0009361966513097286, + "learning_rate": 0.0992558245449225, + "loss": 0.0549, + "num_input_tokens_seen": 22065536, + "step": 24395 + }, + { + "epoch": 6.439751880691567, + "grad_norm": 0.003090451005846262, + "learning_rate": 0.09920039661032651, + "loss": 0.0718, + "num_input_tokens_seen": 22070176, + "step": 24400 + }, + { + "epoch": 6.439751880691567, + "eval_loss": 0.0887286588549614, + "eval_runtime": 75.9703, + "eval_samples_per_second": 88.653, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 22070176, + "step": 24400 + }, + { + "epoch": 6.441071664247064, + "grad_norm": 0.0015785004943609238, + "learning_rate": 0.09914497650966782, + "loss": 0.0503, + "num_input_tokens_seen": 22074272, + "step": 24405 + }, + { + "epoch": 6.44239144780256, + "grad_norm": 0.001453356584534049, + "learning_rate": 0.09908956425149276, + "loss": 0.0447, + "num_input_tokens_seen": 22078592, + "step": 24410 + }, + { + "epoch": 6.443711231358058, + "grad_norm": 0.0012972976546734571, + "learning_rate": 0.09903415984434677, + "loss": 0.059, + "num_input_tokens_seen": 22082752, + "step": 24415 + }, + { + "epoch": 6.445031014913554, + "grad_norm": 0.0012326318537816405, + "learning_rate": 0.09897876329677373, + "loss": 0.0513, + "num_input_tokens_seen": 22087328, + "step": 24420 + }, + { + "epoch": 6.446350798469051, + "grad_norm": 0.0024751678574830294, + "learning_rate": 0.09892337461731658, + "loss": 0.0412, + "num_input_tokens_seen": 22091904, + "step": 24425 + }, + { + "epoch": 6.447670582024548, + "grad_norm": 0.0016626258147880435, + "learning_rate": 0.09886799381451693, + "loss": 0.0389, + "num_input_tokens_seen": 22096128, + "step": 24430 + }, + { + "epoch": 6.448990365580045, + "grad_norm": 0.0015009380877017975, + "learning_rate": 0.09881262089691521, + "loss": 0.0358, + "num_input_tokens_seen": 22100640, + "step": 24435 + }, + { + "epoch": 6.450310149135542, + "grad_norm": 0.0016880714101716876, + "learning_rate": 0.09875725587305059, + "loss": 0.071, + "num_input_tokens_seen": 22105312, + "step": 24440 + }, + { + "epoch": 6.451629932691039, + "grad_norm": 0.000325327884638682, + "learning_rate": 0.09870189875146111, + "loss": 0.0262, + "num_input_tokens_seen": 22109888, + "step": 24445 + }, + { + "epoch": 6.452949716246535, + "grad_norm": 0.001266307896003127, + "learning_rate": 0.09864654954068346, + "loss": 0.0522, + "num_input_tokens_seen": 22114400, + "step": 24450 + }, + { + "epoch": 6.454269499802033, + "grad_norm": 0.003099048277363181, + "learning_rate": 0.09859120824925326, + "loss": 0.063, + "num_input_tokens_seen": 22118720, + "step": 24455 + }, + { + "epoch": 6.455589283357529, + "grad_norm": 0.001947605051100254, + "learning_rate": 0.09853587488570474, + "loss": 0.0532, + "num_input_tokens_seen": 22123328, + "step": 24460 + }, + { + "epoch": 6.456909066913026, + "grad_norm": 0.0028037193696945906, + "learning_rate": 0.09848054945857107, + "loss": 0.0481, + "num_input_tokens_seen": 22127744, + "step": 24465 + }, + { + "epoch": 6.458228850468523, + "grad_norm": 0.0004964504041709006, + "learning_rate": 0.09842523197638416, + "loss": 0.0273, + "num_input_tokens_seen": 22132192, + "step": 24470 + }, + { + "epoch": 6.45954863402402, + "grad_norm": 0.00183008867315948, + "learning_rate": 0.09836992244767452, + "loss": 0.0592, + "num_input_tokens_seen": 22136480, + "step": 24475 + }, + { + "epoch": 6.460868417579517, + "grad_norm": 0.0025124275125563145, + "learning_rate": 0.09831462088097168, + "loss": 0.039, + "num_input_tokens_seen": 22140768, + "step": 24480 + }, + { + "epoch": 6.462188201135014, + "grad_norm": 0.006078775506466627, + "learning_rate": 0.09825932728480385, + "loss": 0.0417, + "num_input_tokens_seen": 22145472, + "step": 24485 + }, + { + "epoch": 6.46350798469051, + "grad_norm": 0.001321542775258422, + "learning_rate": 0.09820404166769794, + "loss": 0.0295, + "num_input_tokens_seen": 22149856, + "step": 24490 + }, + { + "epoch": 6.464827768246008, + "grad_norm": 5.026101280236617e-05, + "learning_rate": 0.09814876403817978, + "loss": 0.0204, + "num_input_tokens_seen": 22154432, + "step": 24495 + }, + { + "epoch": 6.466147551801504, + "grad_norm": 0.0007734053069725633, + "learning_rate": 0.09809349440477376, + "loss": 0.0219, + "num_input_tokens_seen": 22158784, + "step": 24500 + }, + { + "epoch": 6.467467335357002, + "grad_norm": 0.002995854476466775, + "learning_rate": 0.09803823277600317, + "loss": 0.0318, + "num_input_tokens_seen": 22163200, + "step": 24505 + }, + { + "epoch": 6.468787118912498, + "grad_norm": 0.000660340825561434, + "learning_rate": 0.09798297916039014, + "loss": 0.0388, + "num_input_tokens_seen": 22167904, + "step": 24510 + }, + { + "epoch": 6.470106902467995, + "grad_norm": 0.00046194452443160117, + "learning_rate": 0.09792773356645534, + "loss": 0.0355, + "num_input_tokens_seen": 22172608, + "step": 24515 + }, + { + "epoch": 6.471426686023492, + "grad_norm": 0.0013499405467882752, + "learning_rate": 0.09787249600271843, + "loss": 0.0317, + "num_input_tokens_seen": 22176992, + "step": 24520 + }, + { + "epoch": 6.472746469578989, + "grad_norm": 0.004513613414019346, + "learning_rate": 0.09781726647769776, + "loss": 0.0861, + "num_input_tokens_seen": 22181824, + "step": 24525 + }, + { + "epoch": 6.474066253134486, + "grad_norm": 0.0013467356329783797, + "learning_rate": 0.0977620449999103, + "loss": 0.0184, + "num_input_tokens_seen": 22186336, + "step": 24530 + }, + { + "epoch": 6.475386036689983, + "grad_norm": 0.005221650470048189, + "learning_rate": 0.09770683157787204, + "loss": 0.0987, + "num_input_tokens_seen": 22190848, + "step": 24535 + }, + { + "epoch": 6.4767058202454795, + "grad_norm": 0.002308137947693467, + "learning_rate": 0.09765162622009745, + "loss": 0.0755, + "num_input_tokens_seen": 22195584, + "step": 24540 + }, + { + "epoch": 6.478025603800977, + "grad_norm": 0.0011064864229410887, + "learning_rate": 0.09759642893509995, + "loss": 0.0208, + "num_input_tokens_seen": 22200256, + "step": 24545 + }, + { + "epoch": 6.4793453873564735, + "grad_norm": 0.0015044745523482561, + "learning_rate": 0.09754123973139169, + "loss": 0.0627, + "num_input_tokens_seen": 22204320, + "step": 24550 + }, + { + "epoch": 6.48066517091197, + "grad_norm": 0.00456178979948163, + "learning_rate": 0.09748605861748345, + "loss": 0.0633, + "num_input_tokens_seen": 22208992, + "step": 24555 + }, + { + "epoch": 6.4819849544674675, + "grad_norm": 0.0031856803689152002, + "learning_rate": 0.0974308856018849, + "loss": 0.0645, + "num_input_tokens_seen": 22213632, + "step": 24560 + }, + { + "epoch": 6.483304738022964, + "grad_norm": 0.0009745904244482517, + "learning_rate": 0.09737572069310449, + "loss": 0.0917, + "num_input_tokens_seen": 22218144, + "step": 24565 + }, + { + "epoch": 6.4846245215784615, + "grad_norm": 0.0009636434260755777, + "learning_rate": 0.09732056389964922, + "loss": 0.0384, + "num_input_tokens_seen": 22222784, + "step": 24570 + }, + { + "epoch": 6.485944305133958, + "grad_norm": 0.0021543209441006184, + "learning_rate": 0.097265415230025, + "loss": 0.0507, + "num_input_tokens_seen": 22227200, + "step": 24575 + }, + { + "epoch": 6.487264088689455, + "grad_norm": 0.00274884095415473, + "learning_rate": 0.09721027469273648, + "loss": 0.057, + "num_input_tokens_seen": 22232096, + "step": 24580 + }, + { + "epoch": 6.488583872244952, + "grad_norm": 0.0024597758892923594, + "learning_rate": 0.09715514229628695, + "loss": 0.0649, + "num_input_tokens_seen": 22236448, + "step": 24585 + }, + { + "epoch": 6.489903655800449, + "grad_norm": 0.0013277764664962888, + "learning_rate": 0.09710001804917864, + "loss": 0.0426, + "num_input_tokens_seen": 22241056, + "step": 24590 + }, + { + "epoch": 6.491223439355946, + "grad_norm": 0.0007250452181324363, + "learning_rate": 0.09704490195991226, + "loss": 0.0739, + "num_input_tokens_seen": 22245568, + "step": 24595 + }, + { + "epoch": 6.492543222911443, + "grad_norm": 0.0008111625211313367, + "learning_rate": 0.09698979403698753, + "loss": 0.0411, + "num_input_tokens_seen": 22249984, + "step": 24600 + }, + { + "epoch": 6.492543222911443, + "eval_loss": 0.08602908253669739, + "eval_runtime": 76.0361, + "eval_samples_per_second": 88.576, + "eval_steps_per_second": 22.147, + "num_input_tokens_seen": 22249984, + "step": 24600 + }, + { + "epoch": 6.493863006466939, + "grad_norm": 0.0005937853129580617, + "learning_rate": 0.0969346942889027, + "loss": 0.0438, + "num_input_tokens_seen": 22254720, + "step": 24605 + }, + { + "epoch": 6.495182790022437, + "grad_norm": 0.002655579475685954, + "learning_rate": 0.09687960272415487, + "loss": 0.0541, + "num_input_tokens_seen": 22259168, + "step": 24610 + }, + { + "epoch": 6.496502573577933, + "grad_norm": 0.002607894828543067, + "learning_rate": 0.0968245193512399, + "loss": 0.0475, + "num_input_tokens_seen": 22263776, + "step": 24615 + }, + { + "epoch": 6.49782235713343, + "grad_norm": 0.0019617429934442043, + "learning_rate": 0.09676944417865221, + "loss": 0.079, + "num_input_tokens_seen": 22268352, + "step": 24620 + }, + { + "epoch": 6.499142140688927, + "grad_norm": 0.002440880285575986, + "learning_rate": 0.09671437721488517, + "loss": 0.0435, + "num_input_tokens_seen": 22272768, + "step": 24625 + }, + { + "epoch": 6.500461924244424, + "grad_norm": 0.0011704821372404695, + "learning_rate": 0.09665931846843086, + "loss": 0.0383, + "num_input_tokens_seen": 22277056, + "step": 24630 + }, + { + "epoch": 6.501781707799921, + "grad_norm": 0.0009898910066112876, + "learning_rate": 0.0966042679477799, + "loss": 0.0427, + "num_input_tokens_seen": 22281376, + "step": 24635 + }, + { + "epoch": 6.503101491355418, + "grad_norm": 0.000543515314348042, + "learning_rate": 0.09654922566142186, + "loss": 0.027, + "num_input_tokens_seen": 22285760, + "step": 24640 + }, + { + "epoch": 6.504421274910914, + "grad_norm": 0.002074067248031497, + "learning_rate": 0.09649419161784498, + "loss": 0.0639, + "num_input_tokens_seen": 22290688, + "step": 24645 + }, + { + "epoch": 6.505741058466412, + "grad_norm": 0.002333970507606864, + "learning_rate": 0.09643916582553606, + "loss": 0.0305, + "num_input_tokens_seen": 22295360, + "step": 24650 + }, + { + "epoch": 6.507060842021908, + "grad_norm": 0.0013164120027795434, + "learning_rate": 0.09638414829298093, + "loss": 0.0553, + "num_input_tokens_seen": 22300352, + "step": 24655 + }, + { + "epoch": 6.508380625577406, + "grad_norm": 0.003114184597507119, + "learning_rate": 0.09632913902866386, + "loss": 0.0847, + "num_input_tokens_seen": 22305216, + "step": 24660 + }, + { + "epoch": 6.509700409132902, + "grad_norm": 0.0005752110737375915, + "learning_rate": 0.096274138041068, + "loss": 0.0637, + "num_input_tokens_seen": 22309600, + "step": 24665 + }, + { + "epoch": 6.511020192688399, + "grad_norm": 0.0006221121875569224, + "learning_rate": 0.09621914533867527, + "loss": 0.0384, + "num_input_tokens_seen": 22313984, + "step": 24670 + }, + { + "epoch": 6.512339976243896, + "grad_norm": 0.001061448478139937, + "learning_rate": 0.09616416092996616, + "loss": 0.0309, + "num_input_tokens_seen": 22318464, + "step": 24675 + }, + { + "epoch": 6.513659759799393, + "grad_norm": 0.002021936234086752, + "learning_rate": 0.09610918482342, + "loss": 0.0666, + "num_input_tokens_seen": 22322944, + "step": 24680 + }, + { + "epoch": 6.51497954335489, + "grad_norm": 0.0018337336368858814, + "learning_rate": 0.09605421702751478, + "loss": 0.0429, + "num_input_tokens_seen": 22327584, + "step": 24685 + }, + { + "epoch": 6.516299326910387, + "grad_norm": 0.00315740704536438, + "learning_rate": 0.09599925755072718, + "loss": 0.0676, + "num_input_tokens_seen": 22332224, + "step": 24690 + }, + { + "epoch": 6.517619110465883, + "grad_norm": 0.0007358514121733606, + "learning_rate": 0.09594430640153273, + "loss": 0.027, + "num_input_tokens_seen": 22336928, + "step": 24695 + }, + { + "epoch": 6.518938894021381, + "grad_norm": 0.004617419093847275, + "learning_rate": 0.09588936358840547, + "loss": 0.0373, + "num_input_tokens_seen": 22341600, + "step": 24700 + }, + { + "epoch": 6.520258677576877, + "grad_norm": 0.000774516025558114, + "learning_rate": 0.09583442911981836, + "loss": 0.0261, + "num_input_tokens_seen": 22346112, + "step": 24705 + }, + { + "epoch": 6.521578461132374, + "grad_norm": 0.0011563139269128442, + "learning_rate": 0.09577950300424302, + "loss": 0.0428, + "num_input_tokens_seen": 22350560, + "step": 24710 + }, + { + "epoch": 6.522898244687871, + "grad_norm": 0.00026207262999378145, + "learning_rate": 0.09572458525014967, + "loss": 0.0333, + "num_input_tokens_seen": 22355008, + "step": 24715 + }, + { + "epoch": 6.524218028243368, + "grad_norm": 0.0006511469255201519, + "learning_rate": 0.0956696758660073, + "loss": 0.0119, + "num_input_tokens_seen": 22359648, + "step": 24720 + }, + { + "epoch": 6.5255378117988645, + "grad_norm": 0.0022627755533903837, + "learning_rate": 0.09561477486028373, + "loss": 0.055, + "num_input_tokens_seen": 22364288, + "step": 24725 + }, + { + "epoch": 6.526857595354362, + "grad_norm": 0.0007227793685160577, + "learning_rate": 0.09555988224144528, + "loss": 0.0386, + "num_input_tokens_seen": 22368672, + "step": 24730 + }, + { + "epoch": 6.5281773789098585, + "grad_norm": 0.002358479192480445, + "learning_rate": 0.09550499801795717, + "loss": 0.0479, + "num_input_tokens_seen": 22372992, + "step": 24735 + }, + { + "epoch": 6.529497162465356, + "grad_norm": 0.0017252835677936673, + "learning_rate": 0.09545012219828314, + "loss": 0.0404, + "num_input_tokens_seen": 22377376, + "step": 24740 + }, + { + "epoch": 6.5308169460208525, + "grad_norm": 0.002933276817202568, + "learning_rate": 0.09539525479088577, + "loss": 0.0613, + "num_input_tokens_seen": 22381728, + "step": 24745 + }, + { + "epoch": 6.532136729576349, + "grad_norm": 0.0031166868284344673, + "learning_rate": 0.0953403958042264, + "loss": 0.0463, + "num_input_tokens_seen": 22386272, + "step": 24750 + }, + { + "epoch": 6.5334565131318465, + "grad_norm": 0.000630988331977278, + "learning_rate": 0.09528554524676484, + "loss": 0.0331, + "num_input_tokens_seen": 22390880, + "step": 24755 + }, + { + "epoch": 6.534776296687343, + "grad_norm": 0.00015765178250148892, + "learning_rate": 0.09523070312695978, + "loss": 0.0254, + "num_input_tokens_seen": 22395808, + "step": 24760 + }, + { + "epoch": 6.5360960802428405, + "grad_norm": 0.00014752562856301665, + "learning_rate": 0.09517586945326863, + "loss": 0.0621, + "num_input_tokens_seen": 22400384, + "step": 24765 + }, + { + "epoch": 6.537415863798337, + "grad_norm": 0.00047242254368029535, + "learning_rate": 0.0951210442341473, + "loss": 0.0212, + "num_input_tokens_seen": 22404992, + "step": 24770 + }, + { + "epoch": 6.538735647353834, + "grad_norm": 0.002789333462715149, + "learning_rate": 0.09506622747805066, + "loss": 0.0268, + "num_input_tokens_seen": 22409600, + "step": 24775 + }, + { + "epoch": 6.540055430909331, + "grad_norm": 0.0018036317778751254, + "learning_rate": 0.09501141919343203, + "loss": 0.045, + "num_input_tokens_seen": 22413920, + "step": 24780 + }, + { + "epoch": 6.541375214464828, + "grad_norm": 0.002737664617598057, + "learning_rate": 0.09495661938874361, + "loss": 0.0686, + "num_input_tokens_seen": 22418624, + "step": 24785 + }, + { + "epoch": 6.542694998020325, + "grad_norm": 0.002477553440257907, + "learning_rate": 0.0949018280724362, + "loss": 0.03, + "num_input_tokens_seen": 22423296, + "step": 24790 + }, + { + "epoch": 6.544014781575822, + "grad_norm": 0.001394476043060422, + "learning_rate": 0.09484704525295934, + "loss": 0.047, + "num_input_tokens_seen": 22428064, + "step": 24795 + }, + { + "epoch": 6.545334565131318, + "grad_norm": 0.0005163912428542972, + "learning_rate": 0.09479227093876112, + "loss": 0.0506, + "num_input_tokens_seen": 22432352, + "step": 24800 + }, + { + "epoch": 6.545334565131318, + "eval_loss": 0.09081940352916718, + "eval_runtime": 75.976, + "eval_samples_per_second": 88.646, + "eval_steps_per_second": 22.165, + "num_input_tokens_seen": 22432352, + "step": 24800 + }, + { + "epoch": 6.546654348686816, + "grad_norm": 0.001664748415350914, + "learning_rate": 0.0947375051382886, + "loss": 0.0547, + "num_input_tokens_seen": 22436800, + "step": 24805 + }, + { + "epoch": 6.547974132242312, + "grad_norm": 0.0033865165896713734, + "learning_rate": 0.09468274785998718, + "loss": 0.0348, + "num_input_tokens_seen": 22441344, + "step": 24810 + }, + { + "epoch": 6.54929391579781, + "grad_norm": 0.002144625410437584, + "learning_rate": 0.09462799911230127, + "loss": 0.0474, + "num_input_tokens_seen": 22445824, + "step": 24815 + }, + { + "epoch": 6.550613699353306, + "grad_norm": 0.0011698472080752254, + "learning_rate": 0.0945732589036737, + "loss": 0.0436, + "num_input_tokens_seen": 22450144, + "step": 24820 + }, + { + "epoch": 6.551933482908803, + "grad_norm": 0.00042518964619375765, + "learning_rate": 0.09451852724254614, + "loss": 0.0427, + "num_input_tokens_seen": 22454688, + "step": 24825 + }, + { + "epoch": 6.5532532664643, + "grad_norm": 0.0020336417946964502, + "learning_rate": 0.09446380413735894, + "loss": 0.0741, + "num_input_tokens_seen": 22459104, + "step": 24830 + }, + { + "epoch": 6.554573050019797, + "grad_norm": 0.004691729787737131, + "learning_rate": 0.09440908959655099, + "loss": 0.0656, + "num_input_tokens_seen": 22463744, + "step": 24835 + }, + { + "epoch": 6.555892833575293, + "grad_norm": 0.002450589556246996, + "learning_rate": 0.09435438362856004, + "loss": 0.0579, + "num_input_tokens_seen": 22468512, + "step": 24840 + }, + { + "epoch": 6.557212617130791, + "grad_norm": 0.0029565508011728525, + "learning_rate": 0.0942996862418225, + "loss": 0.0527, + "num_input_tokens_seen": 22472800, + "step": 24845 + }, + { + "epoch": 6.558532400686287, + "grad_norm": 0.0013239930849522352, + "learning_rate": 0.09424499744477322, + "loss": 0.0411, + "num_input_tokens_seen": 22477312, + "step": 24850 + }, + { + "epoch": 6.559852184241785, + "grad_norm": 0.001299932599067688, + "learning_rate": 0.09419031724584608, + "loss": 0.0325, + "num_input_tokens_seen": 22481824, + "step": 24855 + }, + { + "epoch": 6.561171967797281, + "grad_norm": 0.0017625490436330438, + "learning_rate": 0.09413564565347331, + "loss": 0.0468, + "num_input_tokens_seen": 22486304, + "step": 24860 + }, + { + "epoch": 6.562491751352778, + "grad_norm": 0.003230612725019455, + "learning_rate": 0.094080982676086, + "loss": 0.0508, + "num_input_tokens_seen": 22490720, + "step": 24865 + }, + { + "epoch": 6.563811534908275, + "grad_norm": 0.0018336949869990349, + "learning_rate": 0.09402632832211395, + "loss": 0.0633, + "num_input_tokens_seen": 22495328, + "step": 24870 + }, + { + "epoch": 6.565131318463772, + "grad_norm": 0.0042235516011714935, + "learning_rate": 0.09397168259998541, + "loss": 0.074, + "num_input_tokens_seen": 22499648, + "step": 24875 + }, + { + "epoch": 6.566451102019268, + "grad_norm": 0.00023661462182644755, + "learning_rate": 0.09391704551812759, + "loss": 0.0296, + "num_input_tokens_seen": 22504224, + "step": 24880 + }, + { + "epoch": 6.567770885574766, + "grad_norm": 0.003391029080376029, + "learning_rate": 0.09386241708496605, + "loss": 0.0412, + "num_input_tokens_seen": 22508768, + "step": 24885 + }, + { + "epoch": 6.569090669130262, + "grad_norm": 0.0036591950338333845, + "learning_rate": 0.09380779730892527, + "loss": 0.0365, + "num_input_tokens_seen": 22513440, + "step": 24890 + }, + { + "epoch": 6.57041045268576, + "grad_norm": 0.0005907119484618306, + "learning_rate": 0.09375318619842836, + "loss": 0.0245, + "num_input_tokens_seen": 22517920, + "step": 24895 + }, + { + "epoch": 6.571730236241256, + "grad_norm": 0.001957801403477788, + "learning_rate": 0.09369858376189696, + "loss": 0.0414, + "num_input_tokens_seen": 22522304, + "step": 24900 + }, + { + "epoch": 6.573050019796753, + "grad_norm": 0.0022004127968102694, + "learning_rate": 0.09364399000775143, + "loss": 0.0524, + "num_input_tokens_seen": 22526624, + "step": 24905 + }, + { + "epoch": 6.57436980335225, + "grad_norm": 0.0008524220902472734, + "learning_rate": 0.09358940494441093, + "loss": 0.0351, + "num_input_tokens_seen": 22531264, + "step": 24910 + }, + { + "epoch": 6.575689586907747, + "grad_norm": 0.0034026873763650656, + "learning_rate": 0.09353482858029301, + "loss": 0.0714, + "num_input_tokens_seen": 22535904, + "step": 24915 + }, + { + "epoch": 6.577009370463244, + "grad_norm": 0.0012116332072764635, + "learning_rate": 0.09348026092381419, + "loss": 0.0342, + "num_input_tokens_seen": 22540352, + "step": 24920 + }, + { + "epoch": 6.578329154018741, + "grad_norm": 0.001439591869711876, + "learning_rate": 0.09342570198338931, + "loss": 0.0485, + "num_input_tokens_seen": 22544928, + "step": 24925 + }, + { + "epoch": 6.5796489375742375, + "grad_norm": 0.0015872269868850708, + "learning_rate": 0.0933711517674322, + "loss": 0.0737, + "num_input_tokens_seen": 22549152, + "step": 24930 + }, + { + "epoch": 6.580968721129735, + "grad_norm": 0.0011262173065915704, + "learning_rate": 0.09331661028435513, + "loss": 0.0487, + "num_input_tokens_seen": 22554016, + "step": 24935 + }, + { + "epoch": 6.5822885046852315, + "grad_norm": 0.00317199295386672, + "learning_rate": 0.09326207754256909, + "loss": 0.0655, + "num_input_tokens_seen": 22558368, + "step": 24940 + }, + { + "epoch": 6.583608288240729, + "grad_norm": 0.002740168944001198, + "learning_rate": 0.09320755355048366, + "loss": 0.0383, + "num_input_tokens_seen": 22562944, + "step": 24945 + }, + { + "epoch": 6.5849280717962255, + "grad_norm": 0.0018983116606250405, + "learning_rate": 0.09315303831650722, + "loss": 0.0253, + "num_input_tokens_seen": 22567648, + "step": 24950 + }, + { + "epoch": 6.586247855351722, + "grad_norm": 0.004290704149752855, + "learning_rate": 0.09309853184904661, + "loss": 0.0955, + "num_input_tokens_seen": 22572160, + "step": 24955 + }, + { + "epoch": 6.5875676389072195, + "grad_norm": 0.0009470511577092111, + "learning_rate": 0.09304403415650753, + "loss": 0.0434, + "num_input_tokens_seen": 22576608, + "step": 24960 + }, + { + "epoch": 6.588887422462716, + "grad_norm": 0.0017517434898763895, + "learning_rate": 0.09298954524729405, + "loss": 0.0704, + "num_input_tokens_seen": 22581120, + "step": 24965 + }, + { + "epoch": 6.590207206018213, + "grad_norm": 0.002280614571645856, + "learning_rate": 0.09293506512980916, + "loss": 0.1133, + "num_input_tokens_seen": 22585760, + "step": 24970 + }, + { + "epoch": 6.59152698957371, + "grad_norm": 0.0023319339379668236, + "learning_rate": 0.0928805938124544, + "loss": 0.0257, + "num_input_tokens_seen": 22590080, + "step": 24975 + }, + { + "epoch": 6.592846773129207, + "grad_norm": 0.0028192372992634773, + "learning_rate": 0.09282613130362982, + "loss": 0.0488, + "num_input_tokens_seen": 22594592, + "step": 24980 + }, + { + "epoch": 6.594166556684704, + "grad_norm": 0.0010565390111878514, + "learning_rate": 0.09277167761173427, + "loss": 0.0398, + "num_input_tokens_seen": 22599136, + "step": 24985 + }, + { + "epoch": 6.595486340240201, + "grad_norm": 0.0011409983271732926, + "learning_rate": 0.0927172327451653, + "loss": 0.0583, + "num_input_tokens_seen": 22603488, + "step": 24990 + }, + { + "epoch": 6.596806123795697, + "grad_norm": 0.0032815660815685987, + "learning_rate": 0.09266279671231882, + "loss": 0.0416, + "num_input_tokens_seen": 22607968, + "step": 24995 + }, + { + "epoch": 6.598125907351195, + "grad_norm": 0.001838360563851893, + "learning_rate": 0.09260836952158967, + "loss": 0.0504, + "num_input_tokens_seen": 22612672, + "step": 25000 + }, + { + "epoch": 6.598125907351195, + "eval_loss": 0.08927285671234131, + "eval_runtime": 75.9639, + "eval_samples_per_second": 88.66, + "eval_steps_per_second": 22.168, + "num_input_tokens_seen": 22612672, + "step": 25000 + }, + { + "epoch": 6.599445690906691, + "grad_norm": 0.0016983801033347845, + "learning_rate": 0.09255395118137114, + "loss": 0.0473, + "num_input_tokens_seen": 22616992, + "step": 25005 + }, + { + "epoch": 6.600765474462188, + "grad_norm": 0.0012362406123429537, + "learning_rate": 0.09249954170005527, + "loss": 0.0423, + "num_input_tokens_seen": 22621504, + "step": 25010 + }, + { + "epoch": 6.602085258017685, + "grad_norm": 0.0032890206202864647, + "learning_rate": 0.0924451410860327, + "loss": 0.0767, + "num_input_tokens_seen": 22626368, + "step": 25015 + }, + { + "epoch": 6.603405041573182, + "grad_norm": 0.004132839385420084, + "learning_rate": 0.09239074934769258, + "loss": 0.0793, + "num_input_tokens_seen": 22631296, + "step": 25020 + }, + { + "epoch": 6.604724825128679, + "grad_norm": 0.0014122625580057502, + "learning_rate": 0.09233636649342288, + "loss": 0.0713, + "num_input_tokens_seen": 22635616, + "step": 25025 + }, + { + "epoch": 6.606044608684176, + "grad_norm": 0.003170388750731945, + "learning_rate": 0.09228199253161017, + "loss": 0.0655, + "num_input_tokens_seen": 22639968, + "step": 25030 + }, + { + "epoch": 6.607364392239672, + "grad_norm": 0.0006149118416942656, + "learning_rate": 0.09222762747063949, + "loss": 0.0278, + "num_input_tokens_seen": 22644704, + "step": 25035 + }, + { + "epoch": 6.60868417579517, + "grad_norm": 0.0026581890415400267, + "learning_rate": 0.09217327131889473, + "loss": 0.0539, + "num_input_tokens_seen": 22649088, + "step": 25040 + }, + { + "epoch": 6.610003959350666, + "grad_norm": 0.0020966280717402697, + "learning_rate": 0.09211892408475818, + "loss": 0.0448, + "num_input_tokens_seen": 22653664, + "step": 25045 + }, + { + "epoch": 6.611323742906164, + "grad_norm": 0.0036925221793353558, + "learning_rate": 0.09206458577661089, + "loss": 0.0417, + "num_input_tokens_seen": 22658528, + "step": 25050 + }, + { + "epoch": 6.61264352646166, + "grad_norm": 0.0012761922553181648, + "learning_rate": 0.09201025640283263, + "loss": 0.0317, + "num_input_tokens_seen": 22662944, + "step": 25055 + }, + { + "epoch": 6.613963310017157, + "grad_norm": 0.001352987252175808, + "learning_rate": 0.09195593597180148, + "loss": 0.0234, + "num_input_tokens_seen": 22667488, + "step": 25060 + }, + { + "epoch": 6.615283093572654, + "grad_norm": 0.005330657586455345, + "learning_rate": 0.09190162449189444, + "loss": 0.05, + "num_input_tokens_seen": 22672128, + "step": 25065 + }, + { + "epoch": 6.616602877128151, + "grad_norm": 0.0006056731217540801, + "learning_rate": 0.09184732197148705, + "loss": 0.049, + "num_input_tokens_seen": 22676800, + "step": 25070 + }, + { + "epoch": 6.617922660683648, + "grad_norm": 0.0012822651769965887, + "learning_rate": 0.09179302841895343, + "loss": 0.0225, + "num_input_tokens_seen": 22681280, + "step": 25075 + }, + { + "epoch": 6.619242444239145, + "grad_norm": 0.0010857151355594397, + "learning_rate": 0.09173874384266625, + "loss": 0.0611, + "num_input_tokens_seen": 22685600, + "step": 25080 + }, + { + "epoch": 6.620562227794641, + "grad_norm": 0.0031226826831698418, + "learning_rate": 0.09168446825099695, + "loss": 0.0476, + "num_input_tokens_seen": 22689856, + "step": 25085 + }, + { + "epoch": 6.621882011350139, + "grad_norm": 0.0019347027409821749, + "learning_rate": 0.09163020165231545, + "loss": 0.0624, + "num_input_tokens_seen": 22694176, + "step": 25090 + }, + { + "epoch": 6.623201794905635, + "grad_norm": 0.0026674973778426647, + "learning_rate": 0.09157594405499044, + "loss": 0.045, + "num_input_tokens_seen": 22698624, + "step": 25095 + }, + { + "epoch": 6.624521578461132, + "grad_norm": 0.002756253583356738, + "learning_rate": 0.09152169546738899, + "loss": 0.0267, + "num_input_tokens_seen": 22703232, + "step": 25100 + }, + { + "epoch": 6.625841362016629, + "grad_norm": 0.001179149141535163, + "learning_rate": 0.09146745589787698, + "loss": 0.0266, + "num_input_tokens_seen": 22707584, + "step": 25105 + }, + { + "epoch": 6.627161145572126, + "grad_norm": 0.0018349396996200085, + "learning_rate": 0.09141322535481891, + "loss": 0.053, + "num_input_tokens_seen": 22711840, + "step": 25110 + }, + { + "epoch": 6.628480929127623, + "grad_norm": 0.002010544529184699, + "learning_rate": 0.0913590038465777, + "loss": 0.0748, + "num_input_tokens_seen": 22716448, + "step": 25115 + }, + { + "epoch": 6.62980071268312, + "grad_norm": 0.000590840878430754, + "learning_rate": 0.09130479138151505, + "loss": 0.056, + "num_input_tokens_seen": 22720960, + "step": 25120 + }, + { + "epoch": 6.6311204962386165, + "grad_norm": 0.001190326176583767, + "learning_rate": 0.09125058796799114, + "loss": 0.0254, + "num_input_tokens_seen": 22725472, + "step": 25125 + }, + { + "epoch": 6.632440279794114, + "grad_norm": 0.0014430468436330557, + "learning_rate": 0.09119639361436485, + "loss": 0.0539, + "num_input_tokens_seen": 22729760, + "step": 25130 + }, + { + "epoch": 6.6337600633496105, + "grad_norm": 0.000752709514927119, + "learning_rate": 0.09114220832899368, + "loss": 0.034, + "num_input_tokens_seen": 22734336, + "step": 25135 + }, + { + "epoch": 6.635079846905107, + "grad_norm": 0.002453487366437912, + "learning_rate": 0.0910880321202336, + "loss": 0.0603, + "num_input_tokens_seen": 22738816, + "step": 25140 + }, + { + "epoch": 6.6363996304606045, + "grad_norm": 0.002518797293305397, + "learning_rate": 0.09103386499643933, + "loss": 0.06, + "num_input_tokens_seen": 22743488, + "step": 25145 + }, + { + "epoch": 6.637719414016101, + "grad_norm": 0.0024216300807893276, + "learning_rate": 0.09097970696596407, + "loss": 0.0491, + "num_input_tokens_seen": 22747840, + "step": 25150 + }, + { + "epoch": 6.6390391975715985, + "grad_norm": 0.0010211450280621648, + "learning_rate": 0.09092555803715971, + "loss": 0.0483, + "num_input_tokens_seen": 22752416, + "step": 25155 + }, + { + "epoch": 6.640358981127095, + "grad_norm": 0.0035376211162656546, + "learning_rate": 0.0908714182183767, + "loss": 0.0798, + "num_input_tokens_seen": 22757472, + "step": 25160 + }, + { + "epoch": 6.641678764682592, + "grad_norm": 0.0021039803978055716, + "learning_rate": 0.090817287517964, + "loss": 0.045, + "num_input_tokens_seen": 22761632, + "step": 25165 + }, + { + "epoch": 6.642998548238089, + "grad_norm": 0.001116974395699799, + "learning_rate": 0.09076316594426931, + "loss": 0.0328, + "num_input_tokens_seen": 22766080, + "step": 25170 + }, + { + "epoch": 6.644318331793586, + "grad_norm": 0.001048059668391943, + "learning_rate": 0.09070905350563888, + "loss": 0.0727, + "num_input_tokens_seen": 22770784, + "step": 25175 + }, + { + "epoch": 6.645638115349083, + "grad_norm": 0.001975392457097769, + "learning_rate": 0.09065495021041745, + "loss": 0.0703, + "num_input_tokens_seen": 22775360, + "step": 25180 + }, + { + "epoch": 6.64695789890458, + "grad_norm": 0.003793458454310894, + "learning_rate": 0.09060085606694851, + "loss": 0.0544, + "num_input_tokens_seen": 22780096, + "step": 25185 + }, + { + "epoch": 6.648277682460076, + "grad_norm": 0.0033764580730348825, + "learning_rate": 0.09054677108357405, + "loss": 0.0754, + "num_input_tokens_seen": 22784960, + "step": 25190 + }, + { + "epoch": 6.649597466015574, + "grad_norm": 0.0006199695635586977, + "learning_rate": 0.09049269526863457, + "loss": 0.0331, + "num_input_tokens_seen": 22789536, + "step": 25195 + }, + { + "epoch": 6.65091724957107, + "grad_norm": 0.0020717219449579716, + "learning_rate": 0.09043862863046935, + "loss": 0.0514, + "num_input_tokens_seen": 22793920, + "step": 25200 + }, + { + "epoch": 6.65091724957107, + "eval_loss": 0.08937113732099533, + "eval_runtime": 75.8652, + "eval_samples_per_second": 88.776, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 22793920, + "step": 25200 + }, + { + "epoch": 6.652237033126568, + "grad_norm": 0.0022351816296577454, + "learning_rate": 0.09038457117741602, + "loss": 0.0815, + "num_input_tokens_seen": 22798464, + "step": 25205 + }, + { + "epoch": 6.653556816682064, + "grad_norm": 0.005045734811574221, + "learning_rate": 0.09033052291781099, + "loss": 0.0343, + "num_input_tokens_seen": 22803232, + "step": 25210 + }, + { + "epoch": 6.654876600237561, + "grad_norm": 0.003043610602617264, + "learning_rate": 0.09027648385998926, + "loss": 0.0416, + "num_input_tokens_seen": 22807360, + "step": 25215 + }, + { + "epoch": 6.656196383793058, + "grad_norm": 0.00659911846742034, + "learning_rate": 0.09022245401228417, + "loss": 0.0746, + "num_input_tokens_seen": 22811872, + "step": 25220 + }, + { + "epoch": 6.657516167348555, + "grad_norm": 0.000888113456312567, + "learning_rate": 0.09016843338302792, + "loss": 0.0609, + "num_input_tokens_seen": 22816576, + "step": 25225 + }, + { + "epoch": 6.658835950904052, + "grad_norm": 0.00037413553218357265, + "learning_rate": 0.09011442198055115, + "loss": 0.0244, + "num_input_tokens_seen": 22820960, + "step": 25230 + }, + { + "epoch": 6.660155734459549, + "grad_norm": 0.0004158861702308059, + "learning_rate": 0.09006041981318305, + "loss": 0.045, + "num_input_tokens_seen": 22825312, + "step": 25235 + }, + { + "epoch": 6.661475518015045, + "grad_norm": 0.0026902547106146812, + "learning_rate": 0.09000642688925149, + "loss": 0.0484, + "num_input_tokens_seen": 22829824, + "step": 25240 + }, + { + "epoch": 6.662795301570543, + "grad_norm": 0.0009343764395453036, + "learning_rate": 0.0899524432170828, + "loss": 0.0474, + "num_input_tokens_seen": 22834432, + "step": 25245 + }, + { + "epoch": 6.664115085126039, + "grad_norm": 0.002381154103204608, + "learning_rate": 0.08989846880500196, + "loss": 0.041, + "num_input_tokens_seen": 22838912, + "step": 25250 + }, + { + "epoch": 6.665434868681536, + "grad_norm": 0.0028014671988785267, + "learning_rate": 0.08984450366133256, + "loss": 0.0539, + "num_input_tokens_seen": 22843392, + "step": 25255 + }, + { + "epoch": 6.666754652237033, + "grad_norm": 0.0020497979130595922, + "learning_rate": 0.08979054779439664, + "loss": 0.0682, + "num_input_tokens_seen": 22847744, + "step": 25260 + }, + { + "epoch": 6.66807443579253, + "grad_norm": 0.00281157367862761, + "learning_rate": 0.08973660121251485, + "loss": 0.0796, + "num_input_tokens_seen": 22852736, + "step": 25265 + }, + { + "epoch": 6.6693942193480265, + "grad_norm": 0.002015036530792713, + "learning_rate": 0.08968266392400655, + "loss": 0.0331, + "num_input_tokens_seen": 22857344, + "step": 25270 + }, + { + "epoch": 6.670714002903524, + "grad_norm": 0.0017772329738363624, + "learning_rate": 0.0896287359371894, + "loss": 0.0399, + "num_input_tokens_seen": 22861792, + "step": 25275 + }, + { + "epoch": 6.6720337864590205, + "grad_norm": 0.002093160292133689, + "learning_rate": 0.08957481726037989, + "loss": 0.0455, + "num_input_tokens_seen": 22866080, + "step": 25280 + }, + { + "epoch": 6.673353570014518, + "grad_norm": 0.002746366895735264, + "learning_rate": 0.08952090790189286, + "loss": 0.047, + "num_input_tokens_seen": 22870624, + "step": 25285 + }, + { + "epoch": 6.6746733535700145, + "grad_norm": 0.0010965615510940552, + "learning_rate": 0.08946700787004187, + "loss": 0.0554, + "num_input_tokens_seen": 22875072, + "step": 25290 + }, + { + "epoch": 6.675993137125511, + "grad_norm": 0.0016076458850875497, + "learning_rate": 0.08941311717313899, + "loss": 0.0594, + "num_input_tokens_seen": 22879424, + "step": 25295 + }, + { + "epoch": 6.6773129206810085, + "grad_norm": 0.004504163283854723, + "learning_rate": 0.08935923581949483, + "loss": 0.0643, + "num_input_tokens_seen": 22883936, + "step": 25300 + }, + { + "epoch": 6.678632704236505, + "grad_norm": 0.0015496299602091312, + "learning_rate": 0.0893053638174185, + "loss": 0.0316, + "num_input_tokens_seen": 22888448, + "step": 25305 + }, + { + "epoch": 6.6799524877920025, + "grad_norm": 0.0007118090288713574, + "learning_rate": 0.0892515011752179, + "loss": 0.0259, + "num_input_tokens_seen": 22893216, + "step": 25310 + }, + { + "epoch": 6.681272271347499, + "grad_norm": 0.0016915422165766358, + "learning_rate": 0.08919764790119918, + "loss": 0.0406, + "num_input_tokens_seen": 22897472, + "step": 25315 + }, + { + "epoch": 6.682592054902996, + "grad_norm": 0.0020553192589432, + "learning_rate": 0.08914380400366727, + "loss": 0.0788, + "num_input_tokens_seen": 22902048, + "step": 25320 + }, + { + "epoch": 6.683911838458493, + "grad_norm": 0.005953369662165642, + "learning_rate": 0.08908996949092551, + "loss": 0.0699, + "num_input_tokens_seen": 22906496, + "step": 25325 + }, + { + "epoch": 6.68523162201399, + "grad_norm": 0.002344129141420126, + "learning_rate": 0.08903614437127592, + "loss": 0.0352, + "num_input_tokens_seen": 22911072, + "step": 25330 + }, + { + "epoch": 6.686551405569487, + "grad_norm": 0.0030412471387535334, + "learning_rate": 0.088982328653019, + "loss": 0.0461, + "num_input_tokens_seen": 22915584, + "step": 25335 + }, + { + "epoch": 6.687871189124984, + "grad_norm": 0.0007593342452310026, + "learning_rate": 0.0889285223444538, + "loss": 0.0323, + "num_input_tokens_seen": 22920544, + "step": 25340 + }, + { + "epoch": 6.68919097268048, + "grad_norm": 0.002346183406189084, + "learning_rate": 0.08887472545387787, + "loss": 0.0812, + "num_input_tokens_seen": 22925280, + "step": 25345 + }, + { + "epoch": 6.690510756235978, + "grad_norm": 0.0012336279032751918, + "learning_rate": 0.08882093798958751, + "loss": 0.0926, + "num_input_tokens_seen": 22929792, + "step": 25350 + }, + { + "epoch": 6.691830539791474, + "grad_norm": 0.0011427184799686074, + "learning_rate": 0.08876715995987726, + "loss": 0.041, + "num_input_tokens_seen": 22934368, + "step": 25355 + }, + { + "epoch": 6.693150323346972, + "grad_norm": 0.0010187173029407859, + "learning_rate": 0.08871339137304052, + "loss": 0.0318, + "num_input_tokens_seen": 22938816, + "step": 25360 + }, + { + "epoch": 6.694470106902468, + "grad_norm": 0.001895753899589181, + "learning_rate": 0.0886596322373689, + "loss": 0.0545, + "num_input_tokens_seen": 22943392, + "step": 25365 + }, + { + "epoch": 6.695789890457965, + "grad_norm": 0.0010454223956912756, + "learning_rate": 0.08860588256115293, + "loss": 0.0815, + "num_input_tokens_seen": 22948128, + "step": 25370 + }, + { + "epoch": 6.697109674013462, + "grad_norm": 0.00388863543048501, + "learning_rate": 0.0885521423526814, + "loss": 0.0512, + "num_input_tokens_seen": 22952832, + "step": 25375 + }, + { + "epoch": 6.698429457568959, + "grad_norm": 0.001658741501159966, + "learning_rate": 0.08849841162024165, + "loss": 0.0178, + "num_input_tokens_seen": 22957504, + "step": 25380 + }, + { + "epoch": 6.699749241124455, + "grad_norm": 0.0026791361160576344, + "learning_rate": 0.08844469037211973, + "loss": 0.0587, + "num_input_tokens_seen": 22962080, + "step": 25385 + }, + { + "epoch": 6.701069024679953, + "grad_norm": 0.0008362135849893093, + "learning_rate": 0.08839097861660014, + "loss": 0.0335, + "num_input_tokens_seen": 22966496, + "step": 25390 + }, + { + "epoch": 6.702388808235449, + "grad_norm": 0.001200098660774529, + "learning_rate": 0.08833727636196585, + "loss": 0.0496, + "num_input_tokens_seen": 22970624, + "step": 25395 + }, + { + "epoch": 6.703708591790946, + "grad_norm": 0.003736930899322033, + "learning_rate": 0.08828358361649848, + "loss": 0.1031, + "num_input_tokens_seen": 22974976, + "step": 25400 + }, + { + "epoch": 6.703708591790946, + "eval_loss": 0.08643987029790878, + "eval_runtime": 75.9444, + "eval_samples_per_second": 88.683, + "eval_steps_per_second": 22.174, + "num_input_tokens_seen": 22974976, + "step": 25400 + }, + { + "epoch": 6.705028375346443, + "grad_norm": 0.001345103490166366, + "learning_rate": 0.08822990038847807, + "loss": 0.0455, + "num_input_tokens_seen": 22979328, + "step": 25405 + }, + { + "epoch": 6.70634815890194, + "grad_norm": 0.0007912686560302973, + "learning_rate": 0.08817622668618325, + "loss": 0.0361, + "num_input_tokens_seen": 22983712, + "step": 25410 + }, + { + "epoch": 6.707667942457437, + "grad_norm": 0.0023191296495497227, + "learning_rate": 0.08812256251789125, + "loss": 0.0357, + "num_input_tokens_seen": 22988096, + "step": 25415 + }, + { + "epoch": 6.708987726012934, + "grad_norm": 0.0023595995735377073, + "learning_rate": 0.08806890789187766, + "loss": 0.0441, + "num_input_tokens_seen": 22993056, + "step": 25420 + }, + { + "epoch": 6.71030750956843, + "grad_norm": 0.0031044501811265945, + "learning_rate": 0.08801526281641672, + "loss": 0.0578, + "num_input_tokens_seen": 22997600, + "step": 25425 + }, + { + "epoch": 6.711627293123928, + "grad_norm": 0.0035335177090018988, + "learning_rate": 0.0879616272997813, + "loss": 0.0296, + "num_input_tokens_seen": 23002272, + "step": 25430 + }, + { + "epoch": 6.712947076679424, + "grad_norm": 0.003468101844191551, + "learning_rate": 0.08790800135024247, + "loss": 0.1196, + "num_input_tokens_seen": 23006848, + "step": 25435 + }, + { + "epoch": 6.714266860234922, + "grad_norm": 0.0011793534504249692, + "learning_rate": 0.08785438497607023, + "loss": 0.0252, + "num_input_tokens_seen": 23011520, + "step": 25440 + }, + { + "epoch": 6.715586643790418, + "grad_norm": 0.0003712165344040841, + "learning_rate": 0.08780077818553277, + "loss": 0.0227, + "num_input_tokens_seen": 23015808, + "step": 25445 + }, + { + "epoch": 6.716906427345915, + "grad_norm": 0.0032289305236190557, + "learning_rate": 0.0877471809868969, + "loss": 0.1186, + "num_input_tokens_seen": 23020128, + "step": 25450 + }, + { + "epoch": 6.718226210901412, + "grad_norm": 0.0023694043047726154, + "learning_rate": 0.08769359338842811, + "loss": 0.0696, + "num_input_tokens_seen": 23024544, + "step": 25455 + }, + { + "epoch": 6.719545994456909, + "grad_norm": 0.003181065898388624, + "learning_rate": 0.08764001539839016, + "loss": 0.0422, + "num_input_tokens_seen": 23029088, + "step": 25460 + }, + { + "epoch": 6.720865778012406, + "grad_norm": 0.0024535772390663624, + "learning_rate": 0.08758644702504548, + "loss": 0.0498, + "num_input_tokens_seen": 23033376, + "step": 25465 + }, + { + "epoch": 6.722185561567903, + "grad_norm": 0.002155701396986842, + "learning_rate": 0.0875328882766551, + "loss": 0.0595, + "num_input_tokens_seen": 23038176, + "step": 25470 + }, + { + "epoch": 6.7235053451233995, + "grad_norm": 0.0035993619821965694, + "learning_rate": 0.08747933916147828, + "loss": 0.0795, + "num_input_tokens_seen": 23042912, + "step": 25475 + }, + { + "epoch": 6.724825128678897, + "grad_norm": 0.0005594154354184866, + "learning_rate": 0.0874257996877731, + "loss": 0.0561, + "num_input_tokens_seen": 23047264, + "step": 25480 + }, + { + "epoch": 6.7261449122343935, + "grad_norm": 0.0038533515762537718, + "learning_rate": 0.08737226986379593, + "loss": 0.0733, + "num_input_tokens_seen": 23051712, + "step": 25485 + }, + { + "epoch": 6.727464695789891, + "grad_norm": 0.0017536813393235207, + "learning_rate": 0.08731874969780173, + "loss": 0.0883, + "num_input_tokens_seen": 23056320, + "step": 25490 + }, + { + "epoch": 6.7287844793453875, + "grad_norm": 0.0021479555871337652, + "learning_rate": 0.08726523919804412, + "loss": 0.0553, + "num_input_tokens_seen": 23061248, + "step": 25495 + }, + { + "epoch": 6.730104262900884, + "grad_norm": 0.0017701751785352826, + "learning_rate": 0.08721173837277492, + "loss": 0.0499, + "num_input_tokens_seen": 23065984, + "step": 25500 + }, + { + "epoch": 6.7314240464563815, + "grad_norm": 0.0007400967297144234, + "learning_rate": 0.08715824723024479, + "loss": 0.0458, + "num_input_tokens_seen": 23070624, + "step": 25505 + }, + { + "epoch": 6.732743830011878, + "grad_norm": 0.0034066669177263975, + "learning_rate": 0.08710476577870258, + "loss": 0.0634, + "num_input_tokens_seen": 23075008, + "step": 25510 + }, + { + "epoch": 6.734063613567375, + "grad_norm": 0.00247441534884274, + "learning_rate": 0.08705129402639587, + "loss": 0.0458, + "num_input_tokens_seen": 23079648, + "step": 25515 + }, + { + "epoch": 6.735383397122872, + "grad_norm": 0.0022395318374037743, + "learning_rate": 0.08699783198157078, + "loss": 0.0862, + "num_input_tokens_seen": 23084288, + "step": 25520 + }, + { + "epoch": 6.736703180678369, + "grad_norm": 0.0015221989015117288, + "learning_rate": 0.08694437965247163, + "loss": 0.0676, + "num_input_tokens_seen": 23088704, + "step": 25525 + }, + { + "epoch": 6.738022964233865, + "grad_norm": 0.0012543962802737951, + "learning_rate": 0.08689093704734165, + "loss": 0.0338, + "num_input_tokens_seen": 23092928, + "step": 25530 + }, + { + "epoch": 6.739342747789363, + "grad_norm": 0.001527509419247508, + "learning_rate": 0.08683750417442222, + "loss": 0.0364, + "num_input_tokens_seen": 23097024, + "step": 25535 + }, + { + "epoch": 6.740662531344859, + "grad_norm": 0.0013248411705717444, + "learning_rate": 0.08678408104195334, + "loss": 0.0337, + "num_input_tokens_seen": 23101728, + "step": 25540 + }, + { + "epoch": 6.741982314900357, + "grad_norm": 0.0016046997625380754, + "learning_rate": 0.08673066765817365, + "loss": 0.0605, + "num_input_tokens_seen": 23106240, + "step": 25545 + }, + { + "epoch": 6.743302098455853, + "grad_norm": 0.003652298590168357, + "learning_rate": 0.08667726403132005, + "loss": 0.0628, + "num_input_tokens_seen": 23110784, + "step": 25550 + }, + { + "epoch": 6.74462188201135, + "grad_norm": 0.0038108518347144127, + "learning_rate": 0.0866238701696281, + "loss": 0.0368, + "num_input_tokens_seen": 23115296, + "step": 25555 + }, + { + "epoch": 6.745941665566847, + "grad_norm": 0.001784485881216824, + "learning_rate": 0.08657048608133185, + "loss": 0.0564, + "num_input_tokens_seen": 23119936, + "step": 25560 + }, + { + "epoch": 6.747261449122344, + "grad_norm": 0.002097681164741516, + "learning_rate": 0.08651711177466369, + "loss": 0.0607, + "num_input_tokens_seen": 23124448, + "step": 25565 + }, + { + "epoch": 6.748581232677841, + "grad_norm": 0.0009807293536141515, + "learning_rate": 0.08646374725785466, + "loss": 0.0358, + "num_input_tokens_seen": 23128992, + "step": 25570 + }, + { + "epoch": 6.749901016233338, + "grad_norm": 0.002547488547861576, + "learning_rate": 0.08641039253913434, + "loss": 0.053, + "num_input_tokens_seen": 23133472, + "step": 25575 + }, + { + "epoch": 6.751220799788834, + "grad_norm": 0.0023343467619270086, + "learning_rate": 0.08635704762673052, + "loss": 0.0494, + "num_input_tokens_seen": 23137920, + "step": 25580 + }, + { + "epoch": 6.752540583344332, + "grad_norm": 0.0003731203032657504, + "learning_rate": 0.08630371252886981, + "loss": 0.0517, + "num_input_tokens_seen": 23142272, + "step": 25585 + }, + { + "epoch": 6.753860366899828, + "grad_norm": 0.002046659355983138, + "learning_rate": 0.08625038725377704, + "loss": 0.0546, + "num_input_tokens_seen": 23146976, + "step": 25590 + }, + { + "epoch": 6.755180150455326, + "grad_norm": 0.0013739368878304958, + "learning_rate": 0.08619707180967566, + "loss": 0.0545, + "num_input_tokens_seen": 23151424, + "step": 25595 + }, + { + "epoch": 6.756499934010822, + "grad_norm": 0.0020192109514027834, + "learning_rate": 0.08614376620478768, + "loss": 0.0644, + "num_input_tokens_seen": 23155872, + "step": 25600 + }, + { + "epoch": 6.756499934010822, + "eval_loss": 0.08727126568555832, + "eval_runtime": 76.0012, + "eval_samples_per_second": 88.617, + "eval_steps_per_second": 22.158, + "num_input_tokens_seen": 23155872, + "step": 25600 + }, + { + "epoch": 6.757819717566319, + "grad_norm": 0.0004268874181434512, + "learning_rate": 0.08609047044733344, + "loss": 0.0492, + "num_input_tokens_seen": 23160320, + "step": 25605 + }, + { + "epoch": 6.759139501121816, + "grad_norm": 0.001955681247636676, + "learning_rate": 0.08603718454553168, + "loss": 0.0277, + "num_input_tokens_seen": 23164768, + "step": 25610 + }, + { + "epoch": 6.760459284677313, + "grad_norm": 0.0027345209382474422, + "learning_rate": 0.08598390850759997, + "loss": 0.0472, + "num_input_tokens_seen": 23169376, + "step": 25615 + }, + { + "epoch": 6.76177906823281, + "grad_norm": 0.0011694645509123802, + "learning_rate": 0.08593064234175397, + "loss": 0.046, + "num_input_tokens_seen": 23173888, + "step": 25620 + }, + { + "epoch": 6.763098851788307, + "grad_norm": 0.006033760495483875, + "learning_rate": 0.08587738605620815, + "loss": 0.0546, + "num_input_tokens_seen": 23178592, + "step": 25625 + }, + { + "epoch": 6.764418635343803, + "grad_norm": 0.0014115606900304556, + "learning_rate": 0.08582413965917512, + "loss": 0.019, + "num_input_tokens_seen": 23182784, + "step": 25630 + }, + { + "epoch": 6.765738418899301, + "grad_norm": 0.0016950436402112246, + "learning_rate": 0.08577090315886628, + "loss": 0.0692, + "num_input_tokens_seen": 23187456, + "step": 25635 + }, + { + "epoch": 6.767058202454797, + "grad_norm": 0.00363357737660408, + "learning_rate": 0.08571767656349136, + "loss": 0.064, + "num_input_tokens_seen": 23192256, + "step": 25640 + }, + { + "epoch": 6.768377986010294, + "grad_norm": 0.0023430977016687393, + "learning_rate": 0.08566445988125847, + "loss": 0.0725, + "num_input_tokens_seen": 23196928, + "step": 25645 + }, + { + "epoch": 6.769697769565791, + "grad_norm": 0.0008525463053956628, + "learning_rate": 0.08561125312037436, + "loss": 0.0544, + "num_input_tokens_seen": 23201568, + "step": 25650 + }, + { + "epoch": 6.771017553121288, + "grad_norm": 0.0013251601485535502, + "learning_rate": 0.08555805628904424, + "loss": 0.0375, + "num_input_tokens_seen": 23206144, + "step": 25655 + }, + { + "epoch": 6.772337336676785, + "grad_norm": 0.0009995660511776805, + "learning_rate": 0.08550486939547161, + "loss": 0.0446, + "num_input_tokens_seen": 23210528, + "step": 25660 + }, + { + "epoch": 6.773657120232282, + "grad_norm": 0.0014477083459496498, + "learning_rate": 0.08545169244785869, + "loss": 0.0496, + "num_input_tokens_seen": 23215136, + "step": 25665 + }, + { + "epoch": 6.7749769037877785, + "grad_norm": 0.0019180235685780644, + "learning_rate": 0.08539852545440589, + "loss": 0.0376, + "num_input_tokens_seen": 23219520, + "step": 25670 + }, + { + "epoch": 6.776296687343276, + "grad_norm": 0.0003051624516956508, + "learning_rate": 0.08534536842331235, + "loss": 0.0805, + "num_input_tokens_seen": 23224352, + "step": 25675 + }, + { + "epoch": 6.7776164708987725, + "grad_norm": 0.00280876480974257, + "learning_rate": 0.08529222136277545, + "loss": 0.0822, + "num_input_tokens_seen": 23228512, + "step": 25680 + }, + { + "epoch": 6.778936254454269, + "grad_norm": 0.0012477007694542408, + "learning_rate": 0.08523908428099125, + "loss": 0.0781, + "num_input_tokens_seen": 23233344, + "step": 25685 + }, + { + "epoch": 6.7802560380097665, + "grad_norm": 0.005922955460846424, + "learning_rate": 0.08518595718615402, + "loss": 0.0807, + "num_input_tokens_seen": 23238048, + "step": 25690 + }, + { + "epoch": 6.781575821565263, + "grad_norm": 0.0017618556739762425, + "learning_rate": 0.08513284008645675, + "loss": 0.0714, + "num_input_tokens_seen": 23242432, + "step": 25695 + }, + { + "epoch": 6.7828956051207605, + "grad_norm": 0.0005460654501803219, + "learning_rate": 0.08507973299009065, + "loss": 0.0273, + "num_input_tokens_seen": 23246880, + "step": 25700 + }, + { + "epoch": 6.784215388676257, + "grad_norm": 0.003442070446908474, + "learning_rate": 0.08502663590524563, + "loss": 0.0844, + "num_input_tokens_seen": 23251424, + "step": 25705 + }, + { + "epoch": 6.785535172231754, + "grad_norm": 0.002331838943064213, + "learning_rate": 0.08497354884010981, + "loss": 0.0491, + "num_input_tokens_seen": 23255776, + "step": 25710 + }, + { + "epoch": 6.786854955787251, + "grad_norm": 0.0030648009851574898, + "learning_rate": 0.0849204718028699, + "loss": 0.0632, + "num_input_tokens_seen": 23260576, + "step": 25715 + }, + { + "epoch": 6.788174739342748, + "grad_norm": 0.0011657406575977802, + "learning_rate": 0.08486740480171118, + "loss": 0.022, + "num_input_tokens_seen": 23265056, + "step": 25720 + }, + { + "epoch": 6.789494522898245, + "grad_norm": 0.003991090226918459, + "learning_rate": 0.08481434784481706, + "loss": 0.0633, + "num_input_tokens_seen": 23269824, + "step": 25725 + }, + { + "epoch": 6.790814306453742, + "grad_norm": 0.0022852111142128706, + "learning_rate": 0.08476130094036968, + "loss": 0.0385, + "num_input_tokens_seen": 23274304, + "step": 25730 + }, + { + "epoch": 6.792134090009238, + "grad_norm": 0.002244584495201707, + "learning_rate": 0.08470826409654961, + "loss": 0.0569, + "num_input_tokens_seen": 23279072, + "step": 25735 + }, + { + "epoch": 6.793453873564736, + "grad_norm": 0.0013657411327585578, + "learning_rate": 0.08465523732153564, + "loss": 0.0539, + "num_input_tokens_seen": 23283488, + "step": 25740 + }, + { + "epoch": 6.794773657120232, + "grad_norm": 0.0004662269784603268, + "learning_rate": 0.08460222062350532, + "loss": 0.0286, + "num_input_tokens_seen": 23288224, + "step": 25745 + }, + { + "epoch": 6.79609344067573, + "grad_norm": 0.003324692603200674, + "learning_rate": 0.08454921401063442, + "loss": 0.078, + "num_input_tokens_seen": 23292832, + "step": 25750 + }, + { + "epoch": 6.797413224231226, + "grad_norm": 0.0025624253321439028, + "learning_rate": 0.08449621749109716, + "loss": 0.0316, + "num_input_tokens_seen": 23297408, + "step": 25755 + }, + { + "epoch": 6.798733007786723, + "grad_norm": 0.001286323182284832, + "learning_rate": 0.08444323107306641, + "loss": 0.0357, + "num_input_tokens_seen": 23301792, + "step": 25760 + }, + { + "epoch": 6.80005279134222, + "grad_norm": 0.0038129824679344893, + "learning_rate": 0.0843902547647132, + "loss": 0.0516, + "num_input_tokens_seen": 23305888, + "step": 25765 + }, + { + "epoch": 6.801372574897717, + "grad_norm": 0.0009123384952545166, + "learning_rate": 0.0843372885742072, + "loss": 0.034, + "num_input_tokens_seen": 23310144, + "step": 25770 + }, + { + "epoch": 6.802692358453213, + "grad_norm": 0.0010467343963682652, + "learning_rate": 0.08428433250971652, + "loss": 0.0862, + "num_input_tokens_seen": 23314944, + "step": 25775 + }, + { + "epoch": 6.804012142008711, + "grad_norm": 0.0017206751508638263, + "learning_rate": 0.08423138657940757, + "loss": 0.0474, + "num_input_tokens_seen": 23319520, + "step": 25780 + }, + { + "epoch": 6.805331925564207, + "grad_norm": 0.0003329447645228356, + "learning_rate": 0.08417845079144536, + "loss": 0.0275, + "num_input_tokens_seen": 23324032, + "step": 25785 + }, + { + "epoch": 6.806651709119705, + "grad_norm": 0.0010873754508793354, + "learning_rate": 0.08412552515399314, + "loss": 0.0352, + "num_input_tokens_seen": 23328768, + "step": 25790 + }, + { + "epoch": 6.807971492675201, + "grad_norm": 0.0038649151101708412, + "learning_rate": 0.08407260967521278, + "loss": 0.0678, + "num_input_tokens_seen": 23333536, + "step": 25795 + }, + { + "epoch": 6.809291276230698, + "grad_norm": 0.0011461342219263315, + "learning_rate": 0.08401970436326454, + "loss": 0.0625, + "num_input_tokens_seen": 23338048, + "step": 25800 + }, + { + "epoch": 6.809291276230698, + "eval_loss": 0.08600080758333206, + "eval_runtime": 75.7966, + "eval_samples_per_second": 88.856, + "eval_steps_per_second": 22.217, + "num_input_tokens_seen": 23338048, + "step": 25800 + }, + { + "epoch": 6.810611059786195, + "grad_norm": 0.001547355088405311, + "learning_rate": 0.08396680922630702, + "loss": 0.0502, + "num_input_tokens_seen": 23342336, + "step": 25805 + }, + { + "epoch": 6.811930843341692, + "grad_norm": 0.0021411883644759655, + "learning_rate": 0.08391392427249732, + "loss": 0.0422, + "num_input_tokens_seen": 23346496, + "step": 25810 + }, + { + "epoch": 6.813250626897188, + "grad_norm": 0.0022955776657909155, + "learning_rate": 0.08386104950999107, + "loss": 0.0517, + "num_input_tokens_seen": 23351424, + "step": 25815 + }, + { + "epoch": 6.814570410452686, + "grad_norm": 0.0011499160900712013, + "learning_rate": 0.0838081849469421, + "loss": 0.041, + "num_input_tokens_seen": 23356032, + "step": 25820 + }, + { + "epoch": 6.815890194008182, + "grad_norm": 0.0021970232482999563, + "learning_rate": 0.08375533059150281, + "loss": 0.0745, + "num_input_tokens_seen": 23360416, + "step": 25825 + }, + { + "epoch": 6.81720997756368, + "grad_norm": 0.0039553516544401646, + "learning_rate": 0.08370248645182406, + "loss": 0.0617, + "num_input_tokens_seen": 23364672, + "step": 25830 + }, + { + "epoch": 6.818529761119176, + "grad_norm": 0.0013658624375239015, + "learning_rate": 0.083649652536055, + "loss": 0.0243, + "num_input_tokens_seen": 23369056, + "step": 25835 + }, + { + "epoch": 6.819849544674673, + "grad_norm": 0.0019751503132283688, + "learning_rate": 0.08359682885234339, + "loss": 0.0422, + "num_input_tokens_seen": 23373600, + "step": 25840 + }, + { + "epoch": 6.82116932823017, + "grad_norm": 0.0013033457798883319, + "learning_rate": 0.08354401540883516, + "loss": 0.0441, + "num_input_tokens_seen": 23378144, + "step": 25845 + }, + { + "epoch": 6.822489111785667, + "grad_norm": 0.0007604346028529108, + "learning_rate": 0.0834912122136749, + "loss": 0.0428, + "num_input_tokens_seen": 23382848, + "step": 25850 + }, + { + "epoch": 6.823808895341164, + "grad_norm": 0.0017225499032065272, + "learning_rate": 0.0834384192750056, + "loss": 0.0485, + "num_input_tokens_seen": 23387456, + "step": 25855 + }, + { + "epoch": 6.825128678896661, + "grad_norm": 0.0005341400974430144, + "learning_rate": 0.08338563660096844, + "loss": 0.0363, + "num_input_tokens_seen": 23391872, + "step": 25860 + }, + { + "epoch": 6.8264484624521575, + "grad_norm": 0.003160015447065234, + "learning_rate": 0.08333286419970329, + "loss": 0.0652, + "num_input_tokens_seen": 23396736, + "step": 25865 + }, + { + "epoch": 6.827768246007655, + "grad_norm": 0.0034771126229315996, + "learning_rate": 0.08328010207934824, + "loss": 0.0693, + "num_input_tokens_seen": 23401088, + "step": 25870 + }, + { + "epoch": 6.8290880295631515, + "grad_norm": 0.0018580929609015584, + "learning_rate": 0.08322735024803989, + "loss": 0.0646, + "num_input_tokens_seen": 23405376, + "step": 25875 + }, + { + "epoch": 6.830407813118649, + "grad_norm": 0.0031377284321933985, + "learning_rate": 0.08317460871391331, + "loss": 0.0562, + "num_input_tokens_seen": 23410080, + "step": 25880 + }, + { + "epoch": 6.8317275966741455, + "grad_norm": 0.0030870160553604364, + "learning_rate": 0.08312187748510179, + "loss": 0.0543, + "num_input_tokens_seen": 23414528, + "step": 25885 + }, + { + "epoch": 6.833047380229642, + "grad_norm": 0.0011010462185367942, + "learning_rate": 0.08306915656973726, + "loss": 0.0285, + "num_input_tokens_seen": 23419136, + "step": 25890 + }, + { + "epoch": 6.8343671637851395, + "grad_norm": 0.003396181855350733, + "learning_rate": 0.08301644597594988, + "loss": 0.0982, + "num_input_tokens_seen": 23423488, + "step": 25895 + }, + { + "epoch": 6.835686947340636, + "grad_norm": 0.004130094777792692, + "learning_rate": 0.08296374571186826, + "loss": 0.0388, + "num_input_tokens_seen": 23427968, + "step": 25900 + }, + { + "epoch": 6.8370067308961335, + "grad_norm": 0.0009075581911019981, + "learning_rate": 0.08291105578561955, + "loss": 0.0516, + "num_input_tokens_seen": 23432448, + "step": 25905 + }, + { + "epoch": 6.83832651445163, + "grad_norm": 0.0038684371393173933, + "learning_rate": 0.08285837620532904, + "loss": 0.0307, + "num_input_tokens_seen": 23437056, + "step": 25910 + }, + { + "epoch": 6.839646298007127, + "grad_norm": 0.004664561711251736, + "learning_rate": 0.0828057069791207, + "loss": 0.0408, + "num_input_tokens_seen": 23441568, + "step": 25915 + }, + { + "epoch": 6.840966081562624, + "grad_norm": 0.002045167377218604, + "learning_rate": 0.0827530481151168, + "loss": 0.0647, + "num_input_tokens_seen": 23446272, + "step": 25920 + }, + { + "epoch": 6.842285865118121, + "grad_norm": 0.001134679070673883, + "learning_rate": 0.08270039962143792, + "loss": 0.0854, + "num_input_tokens_seen": 23450912, + "step": 25925 + }, + { + "epoch": 6.843605648673617, + "grad_norm": 0.00036347523564472795, + "learning_rate": 0.08264776150620314, + "loss": 0.0588, + "num_input_tokens_seen": 23455584, + "step": 25930 + }, + { + "epoch": 6.844925432229115, + "grad_norm": 0.0013198817614465952, + "learning_rate": 0.08259513377753, + "loss": 0.0735, + "num_input_tokens_seen": 23459904, + "step": 25935 + }, + { + "epoch": 6.846245215784611, + "grad_norm": 0.002100499579682946, + "learning_rate": 0.08254251644353423, + "loss": 0.0478, + "num_input_tokens_seen": 23464608, + "step": 25940 + }, + { + "epoch": 6.847564999340108, + "grad_norm": 0.0016683714929968119, + "learning_rate": 0.08248990951233022, + "loss": 0.054, + "num_input_tokens_seen": 23469184, + "step": 25945 + }, + { + "epoch": 6.848884782895605, + "grad_norm": 0.0025848387740552425, + "learning_rate": 0.08243731299203048, + "loss": 0.083, + "num_input_tokens_seen": 23473760, + "step": 25950 + }, + { + "epoch": 6.850204566451102, + "grad_norm": 0.002390325767919421, + "learning_rate": 0.08238472689074612, + "loss": 0.0506, + "num_input_tokens_seen": 23478400, + "step": 25955 + }, + { + "epoch": 6.851524350006599, + "grad_norm": 0.001968952128663659, + "learning_rate": 0.08233215121658666, + "loss": 0.0843, + "num_input_tokens_seen": 23482944, + "step": 25960 + }, + { + "epoch": 6.852844133562096, + "grad_norm": 0.0026855147443711758, + "learning_rate": 0.08227958597765982, + "loss": 0.0456, + "num_input_tokens_seen": 23487680, + "step": 25965 + }, + { + "epoch": 6.854163917117592, + "grad_norm": 0.0017183548770844936, + "learning_rate": 0.08222703118207181, + "loss": 0.045, + "num_input_tokens_seen": 23492224, + "step": 25970 + }, + { + "epoch": 6.85548370067309, + "grad_norm": 0.0005136047257110476, + "learning_rate": 0.08217448683792734, + "loss": 0.0445, + "num_input_tokens_seen": 23496640, + "step": 25975 + }, + { + "epoch": 6.856803484228586, + "grad_norm": 0.0028911714907735586, + "learning_rate": 0.08212195295332926, + "loss": 0.0603, + "num_input_tokens_seen": 23501088, + "step": 25980 + }, + { + "epoch": 6.858123267784084, + "grad_norm": 0.0008739863405935466, + "learning_rate": 0.08206942953637915, + "loss": 0.0565, + "num_input_tokens_seen": 23505632, + "step": 25985 + }, + { + "epoch": 6.85944305133958, + "grad_norm": 0.0026228714268654585, + "learning_rate": 0.08201691659517658, + "loss": 0.0978, + "num_input_tokens_seen": 23510112, + "step": 25990 + }, + { + "epoch": 6.860762834895077, + "grad_norm": 0.003855934366583824, + "learning_rate": 0.08196441413781981, + "loss": 0.0657, + "num_input_tokens_seen": 23514624, + "step": 25995 + }, + { + "epoch": 6.862082618450574, + "grad_norm": 0.0031645791605114937, + "learning_rate": 0.08191192217240544, + "loss": 0.0601, + "num_input_tokens_seen": 23518912, + "step": 26000 + }, + { + "epoch": 6.862082618450574, + "eval_loss": 0.08528272807598114, + "eval_runtime": 76.0851, + "eval_samples_per_second": 88.519, + "eval_steps_per_second": 22.133, + "num_input_tokens_seen": 23518912, + "step": 26000 + }, + { + "epoch": 6.863402402006071, + "grad_norm": 0.0012299215886741877, + "learning_rate": 0.08185944070702823, + "loss": 0.048, + "num_input_tokens_seen": 23523296, + "step": 26005 + }, + { + "epoch": 6.864722185561568, + "grad_norm": 0.0008572808001190424, + "learning_rate": 0.08180696974978159, + "loss": 0.0456, + "num_input_tokens_seen": 23527808, + "step": 26010 + }, + { + "epoch": 6.866041969117065, + "grad_norm": 0.0019217351218685508, + "learning_rate": 0.08175450930875724, + "loss": 0.0672, + "num_input_tokens_seen": 23531808, + "step": 26015 + }, + { + "epoch": 6.867361752672561, + "grad_norm": 0.0012801805278286338, + "learning_rate": 0.08170205939204513, + "loss": 0.057, + "num_input_tokens_seen": 23536544, + "step": 26020 + }, + { + "epoch": 6.868681536228059, + "grad_norm": 0.0028205725830048323, + "learning_rate": 0.08164962000773379, + "loss": 0.0523, + "num_input_tokens_seen": 23541376, + "step": 26025 + }, + { + "epoch": 6.870001319783555, + "grad_norm": 0.0007412516861222684, + "learning_rate": 0.08159719116390995, + "loss": 0.0593, + "num_input_tokens_seen": 23546048, + "step": 26030 + }, + { + "epoch": 6.871321103339053, + "grad_norm": 0.0019196842331439257, + "learning_rate": 0.08154477286865887, + "loss": 0.0921, + "num_input_tokens_seen": 23550464, + "step": 26035 + }, + { + "epoch": 6.872640886894549, + "grad_norm": 0.00140467775054276, + "learning_rate": 0.08149236513006404, + "loss": 0.047, + "num_input_tokens_seen": 23555136, + "step": 26040 + }, + { + "epoch": 6.873960670450046, + "grad_norm": 0.0010803737677633762, + "learning_rate": 0.08143996795620746, + "loss": 0.0403, + "num_input_tokens_seen": 23559904, + "step": 26045 + }, + { + "epoch": 6.875280454005543, + "grad_norm": 0.002509390702471137, + "learning_rate": 0.08138758135516938, + "loss": 0.0884, + "num_input_tokens_seen": 23564576, + "step": 26050 + }, + { + "epoch": 6.87660023756104, + "grad_norm": 0.0029893929604440928, + "learning_rate": 0.08133520533502851, + "loss": 0.056, + "num_input_tokens_seen": 23569088, + "step": 26055 + }, + { + "epoch": 6.8779200211165366, + "grad_norm": 0.0008678185404278338, + "learning_rate": 0.08128283990386184, + "loss": 0.0618, + "num_input_tokens_seen": 23573472, + "step": 26060 + }, + { + "epoch": 6.879239804672034, + "grad_norm": 0.0014861170202493668, + "learning_rate": 0.08123048506974488, + "loss": 0.0615, + "num_input_tokens_seen": 23577824, + "step": 26065 + }, + { + "epoch": 6.8805595882275306, + "grad_norm": 0.0010495021706447005, + "learning_rate": 0.08117814084075124, + "loss": 0.0382, + "num_input_tokens_seen": 23582336, + "step": 26070 + }, + { + "epoch": 6.881879371783027, + "grad_norm": 0.0037072720006108284, + "learning_rate": 0.08112580722495318, + "loss": 0.0715, + "num_input_tokens_seen": 23586816, + "step": 26075 + }, + { + "epoch": 6.8831991553385246, + "grad_norm": 0.0030215203296393156, + "learning_rate": 0.08107348423042122, + "loss": 0.05, + "num_input_tokens_seen": 23591264, + "step": 26080 + }, + { + "epoch": 6.884518938894021, + "grad_norm": 0.0029031310696154833, + "learning_rate": 0.08102117186522413, + "loss": 0.042, + "num_input_tokens_seen": 23596384, + "step": 26085 + }, + { + "epoch": 6.885838722449519, + "grad_norm": 0.0007879711920395494, + "learning_rate": 0.08096887013742916, + "loss": 0.0412, + "num_input_tokens_seen": 23600768, + "step": 26090 + }, + { + "epoch": 6.887158506005015, + "grad_norm": 0.0005465224967338145, + "learning_rate": 0.08091657905510198, + "loss": 0.0344, + "num_input_tokens_seen": 23605184, + "step": 26095 + }, + { + "epoch": 6.888478289560512, + "grad_norm": 0.002945821499451995, + "learning_rate": 0.08086429862630642, + "loss": 0.0642, + "num_input_tokens_seen": 23609920, + "step": 26100 + }, + { + "epoch": 6.889798073116009, + "grad_norm": 0.0023454935289919376, + "learning_rate": 0.08081202885910488, + "loss": 0.0277, + "num_input_tokens_seen": 23614656, + "step": 26105 + }, + { + "epoch": 6.891117856671506, + "grad_norm": 0.00047852713032625616, + "learning_rate": 0.08075976976155795, + "loss": 0.073, + "num_input_tokens_seen": 23619168, + "step": 26110 + }, + { + "epoch": 6.892437640227003, + "grad_norm": 0.0004352498799562454, + "learning_rate": 0.08070752134172461, + "loss": 0.0246, + "num_input_tokens_seen": 23623808, + "step": 26115 + }, + { + "epoch": 6.8937574237825, + "grad_norm": 0.0027129105292260647, + "learning_rate": 0.08065528360766229, + "loss": 0.0392, + "num_input_tokens_seen": 23628320, + "step": 26120 + }, + { + "epoch": 6.895077207337996, + "grad_norm": 0.0014722361229360104, + "learning_rate": 0.08060305656742664, + "loss": 0.0267, + "num_input_tokens_seen": 23632736, + "step": 26125 + }, + { + "epoch": 6.896396990893494, + "grad_norm": 0.002649789908900857, + "learning_rate": 0.08055084022907182, + "loss": 0.0714, + "num_input_tokens_seen": 23637088, + "step": 26130 + }, + { + "epoch": 6.89771677444899, + "grad_norm": 0.0004721966397482902, + "learning_rate": 0.08049863460065014, + "loss": 0.0674, + "num_input_tokens_seen": 23641696, + "step": 26135 + }, + { + "epoch": 6.899036558004488, + "grad_norm": 0.0012406143359839916, + "learning_rate": 0.0804464396902124, + "loss": 0.0302, + "num_input_tokens_seen": 23646080, + "step": 26140 + }, + { + "epoch": 6.900356341559984, + "grad_norm": 0.0006130090914666653, + "learning_rate": 0.08039425550580777, + "loss": 0.0644, + "num_input_tokens_seen": 23650784, + "step": 26145 + }, + { + "epoch": 6.901676125115481, + "grad_norm": 0.001774326665326953, + "learning_rate": 0.08034208205548363, + "loss": 0.0317, + "num_input_tokens_seen": 23655040, + "step": 26150 + }, + { + "epoch": 6.902995908670978, + "grad_norm": 0.0019961223006248474, + "learning_rate": 0.08028991934728581, + "loss": 0.0438, + "num_input_tokens_seen": 23659424, + "step": 26155 + }, + { + "epoch": 6.904315692226475, + "grad_norm": 0.0024280447978526354, + "learning_rate": 0.0802377673892585, + "loss": 0.0421, + "num_input_tokens_seen": 23664096, + "step": 26160 + }, + { + "epoch": 6.905635475781972, + "grad_norm": 0.0027206127997487783, + "learning_rate": 0.0801856261894441, + "loss": 0.0677, + "num_input_tokens_seen": 23668704, + "step": 26165 + }, + { + "epoch": 6.906955259337469, + "grad_norm": 0.0024896995164453983, + "learning_rate": 0.08013349575588354, + "loss": 0.0416, + "num_input_tokens_seen": 23673312, + "step": 26170 + }, + { + "epoch": 6.908275042892965, + "grad_norm": 0.00022333832748699933, + "learning_rate": 0.08008137609661586, + "loss": 0.0549, + "num_input_tokens_seen": 23678016, + "step": 26175 + }, + { + "epoch": 6.909594826448463, + "grad_norm": 0.0031984078232198954, + "learning_rate": 0.08002926721967872, + "loss": 0.0488, + "num_input_tokens_seen": 23682656, + "step": 26180 + }, + { + "epoch": 6.910914610003959, + "grad_norm": 0.000568473944440484, + "learning_rate": 0.07997716913310782, + "loss": 0.0586, + "num_input_tokens_seen": 23687168, + "step": 26185 + }, + { + "epoch": 6.912234393559456, + "grad_norm": 0.0013255465310066938, + "learning_rate": 0.07992508184493745, + "loss": 0.0203, + "num_input_tokens_seen": 23691456, + "step": 26190 + }, + { + "epoch": 6.913554177114953, + "grad_norm": 0.0016959450440481305, + "learning_rate": 0.07987300536320001, + "loss": 0.0719, + "num_input_tokens_seen": 23695872, + "step": 26195 + }, + { + "epoch": 6.91487396067045, + "grad_norm": 0.0023119961842894554, + "learning_rate": 0.07982093969592649, + "loss": 0.0691, + "num_input_tokens_seen": 23700448, + "step": 26200 + }, + { + "epoch": 6.91487396067045, + "eval_loss": 0.08348097652196884, + "eval_runtime": 75.8682, + "eval_samples_per_second": 88.772, + "eval_steps_per_second": 22.196, + "num_input_tokens_seen": 23700448, + "step": 26200 + }, + { + "epoch": 6.9161937442259465, + "grad_norm": 0.0005365968099795282, + "learning_rate": 0.07976888485114592, + "loss": 0.0406, + "num_input_tokens_seen": 23705088, + "step": 26205 + }, + { + "epoch": 6.917513527781444, + "grad_norm": 0.001819234574213624, + "learning_rate": 0.07971684083688595, + "loss": 0.066, + "num_input_tokens_seen": 23709568, + "step": 26210 + }, + { + "epoch": 6.9188333113369405, + "grad_norm": 0.0011878128862008452, + "learning_rate": 0.0796648076611723, + "loss": 0.047, + "num_input_tokens_seen": 23714464, + "step": 26215 + }, + { + "epoch": 6.920153094892438, + "grad_norm": 0.0014535346999764442, + "learning_rate": 0.07961278533202922, + "loss": 0.0494, + "num_input_tokens_seen": 23719040, + "step": 26220 + }, + { + "epoch": 6.9214728784479345, + "grad_norm": 0.0021561002358794212, + "learning_rate": 0.07956077385747919, + "loss": 0.0447, + "num_input_tokens_seen": 23723488, + "step": 26225 + }, + { + "epoch": 6.922792662003431, + "grad_norm": 0.0021482629235833883, + "learning_rate": 0.079508773245543, + "loss": 0.0536, + "num_input_tokens_seen": 23727968, + "step": 26230 + }, + { + "epoch": 6.9241124455589285, + "grad_norm": 0.002440907759591937, + "learning_rate": 0.07945678350423982, + "loss": 0.0392, + "num_input_tokens_seen": 23732608, + "step": 26235 + }, + { + "epoch": 6.925432229114425, + "grad_norm": 0.0029327983502298594, + "learning_rate": 0.07940480464158717, + "loss": 0.0331, + "num_input_tokens_seen": 23737312, + "step": 26240 + }, + { + "epoch": 6.9267520126699225, + "grad_norm": 0.0013044332154095173, + "learning_rate": 0.07935283666560076, + "loss": 0.0381, + "num_input_tokens_seen": 23742016, + "step": 26245 + }, + { + "epoch": 6.928071796225419, + "grad_norm": 0.0025779602583497763, + "learning_rate": 0.07930087958429478, + "loss": 0.0385, + "num_input_tokens_seen": 23746528, + "step": 26250 + }, + { + "epoch": 6.929391579780916, + "grad_norm": 0.0017493761843070388, + "learning_rate": 0.07924893340568159, + "loss": 0.0349, + "num_input_tokens_seen": 23751072, + "step": 26255 + }, + { + "epoch": 6.930711363336413, + "grad_norm": 0.0032777111046016216, + "learning_rate": 0.07919699813777205, + "loss": 0.0346, + "num_input_tokens_seen": 23755456, + "step": 26260 + }, + { + "epoch": 6.93203114689191, + "grad_norm": 0.004119204357266426, + "learning_rate": 0.07914507378857515, + "loss": 0.0669, + "num_input_tokens_seen": 23759808, + "step": 26265 + }, + { + "epoch": 6.933350930447407, + "grad_norm": 0.0015580987092107534, + "learning_rate": 0.07909316036609822, + "loss": 0.0604, + "num_input_tokens_seen": 23764480, + "step": 26270 + }, + { + "epoch": 6.934670714002904, + "grad_norm": 0.0027699293568730354, + "learning_rate": 0.07904125787834704, + "loss": 0.0396, + "num_input_tokens_seen": 23768992, + "step": 26275 + }, + { + "epoch": 6.9359904975584, + "grad_norm": 0.0011010079178959131, + "learning_rate": 0.07898936633332569, + "loss": 0.0205, + "num_input_tokens_seen": 23773280, + "step": 26280 + }, + { + "epoch": 6.937310281113898, + "grad_norm": 0.00639279093593359, + "learning_rate": 0.07893748573903635, + "loss": 0.1052, + "num_input_tokens_seen": 23777728, + "step": 26285 + }, + { + "epoch": 6.938630064669394, + "grad_norm": 0.0022382766474038363, + "learning_rate": 0.0788856161034798, + "loss": 0.0645, + "num_input_tokens_seen": 23782112, + "step": 26290 + }, + { + "epoch": 6.939949848224892, + "grad_norm": 0.0015746960416436195, + "learning_rate": 0.07883375743465487, + "loss": 0.0234, + "num_input_tokens_seen": 23786848, + "step": 26295 + }, + { + "epoch": 6.941269631780388, + "grad_norm": 0.0031323737930506468, + "learning_rate": 0.07878190974055888, + "loss": 0.0476, + "num_input_tokens_seen": 23791424, + "step": 26300 + }, + { + "epoch": 6.942589415335885, + "grad_norm": 0.002113193506374955, + "learning_rate": 0.07873007302918746, + "loss": 0.0263, + "num_input_tokens_seen": 23795744, + "step": 26305 + }, + { + "epoch": 6.943909198891382, + "grad_norm": 0.0006760316318832338, + "learning_rate": 0.07867824730853433, + "loss": 0.0558, + "num_input_tokens_seen": 23800192, + "step": 26310 + }, + { + "epoch": 6.945228982446879, + "grad_norm": 0.002340424107387662, + "learning_rate": 0.07862643258659176, + "loss": 0.0368, + "num_input_tokens_seen": 23804768, + "step": 26315 + }, + { + "epoch": 6.946548766002375, + "grad_norm": 0.001996312290430069, + "learning_rate": 0.07857462887135026, + "loss": 0.0243, + "num_input_tokens_seen": 23809248, + "step": 26320 + }, + { + "epoch": 6.947868549557873, + "grad_norm": 0.0033965182956308126, + "learning_rate": 0.0785228361707986, + "loss": 0.0245, + "num_input_tokens_seen": 23813984, + "step": 26325 + }, + { + "epoch": 6.949188333113369, + "grad_norm": 0.0025249451864510775, + "learning_rate": 0.07847105449292378, + "loss": 0.0364, + "num_input_tokens_seen": 23818688, + "step": 26330 + }, + { + "epoch": 6.950508116668866, + "grad_norm": 0.002393353031948209, + "learning_rate": 0.0784192838457113, + "loss": 0.0587, + "num_input_tokens_seen": 23823008, + "step": 26335 + }, + { + "epoch": 6.951827900224363, + "grad_norm": 0.0029224930331110954, + "learning_rate": 0.07836752423714473, + "loss": 0.0353, + "num_input_tokens_seen": 23827328, + "step": 26340 + }, + { + "epoch": 6.95314768377986, + "grad_norm": 0.001999335363507271, + "learning_rate": 0.07831577567520616, + "loss": 0.0537, + "num_input_tokens_seen": 23831840, + "step": 26345 + }, + { + "epoch": 6.954467467335357, + "grad_norm": 0.0016212305054068565, + "learning_rate": 0.07826403816787579, + "loss": 0.037, + "num_input_tokens_seen": 23835936, + "step": 26350 + }, + { + "epoch": 6.955787250890854, + "grad_norm": 0.0034896647557616234, + "learning_rate": 0.0782123117231322, + "loss": 0.0669, + "num_input_tokens_seen": 23840544, + "step": 26355 + }, + { + "epoch": 6.95710703444635, + "grad_norm": 0.0007901648059487343, + "learning_rate": 0.07816059634895237, + "loss": 0.0327, + "num_input_tokens_seen": 23845120, + "step": 26360 + }, + { + "epoch": 6.958426818001848, + "grad_norm": 0.0038912647869437933, + "learning_rate": 0.0781088920533113, + "loss": 0.0566, + "num_input_tokens_seen": 23849376, + "step": 26365 + }, + { + "epoch": 6.959746601557344, + "grad_norm": 0.0011947921011596918, + "learning_rate": 0.07805719884418257, + "loss": 0.0847, + "num_input_tokens_seen": 23853792, + "step": 26370 + }, + { + "epoch": 6.961066385112842, + "grad_norm": 0.003031352302059531, + "learning_rate": 0.07800551672953779, + "loss": 0.0476, + "num_input_tokens_seen": 23858208, + "step": 26375 + }, + { + "epoch": 6.962386168668338, + "grad_norm": 0.00096204096917063, + "learning_rate": 0.07795384571734709, + "loss": 0.0364, + "num_input_tokens_seen": 23862656, + "step": 26380 + }, + { + "epoch": 6.963705952223835, + "grad_norm": 0.0006294125923886895, + "learning_rate": 0.07790218581557883, + "loss": 0.0584, + "num_input_tokens_seen": 23867680, + "step": 26385 + }, + { + "epoch": 6.965025735779332, + "grad_norm": 0.0018268307903781533, + "learning_rate": 0.07785053703219949, + "loss": 0.0658, + "num_input_tokens_seen": 23872096, + "step": 26390 + }, + { + "epoch": 6.966345519334829, + "grad_norm": 0.0005553217488341033, + "learning_rate": 0.07779889937517409, + "loss": 0.0231, + "num_input_tokens_seen": 23876192, + "step": 26395 + }, + { + "epoch": 6.967665302890326, + "grad_norm": 0.0005184730980545282, + "learning_rate": 0.0777472728524657, + "loss": 0.0419, + "num_input_tokens_seen": 23880704, + "step": 26400 + }, + { + "epoch": 6.967665302890326, + "eval_loss": 0.0840541198849678, + "eval_runtime": 75.8713, + "eval_samples_per_second": 88.769, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 23880704, + "step": 26400 + }, + { + "epoch": 6.968985086445823, + "grad_norm": 0.004455056041479111, + "learning_rate": 0.07769565747203584, + "loss": 0.0567, + "num_input_tokens_seen": 23885088, + "step": 26405 + }, + { + "epoch": 6.9703048700013195, + "grad_norm": 0.0012844683369621634, + "learning_rate": 0.07764405324184427, + "loss": 0.029, + "num_input_tokens_seen": 23889440, + "step": 26410 + }, + { + "epoch": 6.971624653556817, + "grad_norm": 0.0013151667080819607, + "learning_rate": 0.07759246016984889, + "loss": 0.0308, + "num_input_tokens_seen": 23893696, + "step": 26415 + }, + { + "epoch": 6.9729444371123135, + "grad_norm": 0.0033828187733888626, + "learning_rate": 0.07754087826400609, + "loss": 0.0689, + "num_input_tokens_seen": 23898400, + "step": 26420 + }, + { + "epoch": 6.974264220667811, + "grad_norm": 0.0011288435198366642, + "learning_rate": 0.0774893075322705, + "loss": 0.0468, + "num_input_tokens_seen": 23902880, + "step": 26425 + }, + { + "epoch": 6.9755840042233075, + "grad_norm": 0.0005421416717581451, + "learning_rate": 0.07743774798259484, + "loss": 0.0345, + "num_input_tokens_seen": 23907392, + "step": 26430 + }, + { + "epoch": 6.976903787778804, + "grad_norm": 0.0013200609246268868, + "learning_rate": 0.07738619962293032, + "loss": 0.0566, + "num_input_tokens_seen": 23912064, + "step": 26435 + }, + { + "epoch": 6.9782235713343015, + "grad_norm": 0.0033317902125418186, + "learning_rate": 0.0773346624612264, + "loss": 0.0463, + "num_input_tokens_seen": 23916576, + "step": 26440 + }, + { + "epoch": 6.979543354889798, + "grad_norm": 0.0019981013610959053, + "learning_rate": 0.07728313650543066, + "loss": 0.0405, + "num_input_tokens_seen": 23920896, + "step": 26445 + }, + { + "epoch": 6.980863138445295, + "grad_norm": 0.0021418873220682144, + "learning_rate": 0.07723162176348913, + "loss": 0.0544, + "num_input_tokens_seen": 23925568, + "step": 26450 + }, + { + "epoch": 6.982182922000792, + "grad_norm": 0.0019386905478313565, + "learning_rate": 0.07718011824334593, + "loss": 0.0346, + "num_input_tokens_seen": 23930176, + "step": 26455 + }, + { + "epoch": 6.983502705556289, + "grad_norm": 0.0019862668123096228, + "learning_rate": 0.07712862595294363, + "loss": 0.063, + "num_input_tokens_seen": 23934784, + "step": 26460 + }, + { + "epoch": 6.984822489111786, + "grad_norm": 0.0005184879410080612, + "learning_rate": 0.07707714490022301, + "loss": 0.0244, + "num_input_tokens_seen": 23939360, + "step": 26465 + }, + { + "epoch": 6.986142272667283, + "grad_norm": 0.0038261692970991135, + "learning_rate": 0.07702567509312298, + "loss": 0.0995, + "num_input_tokens_seen": 23943936, + "step": 26470 + }, + { + "epoch": 6.987462056222779, + "grad_norm": 0.0032676318660378456, + "learning_rate": 0.07697421653958098, + "loss": 0.0385, + "num_input_tokens_seen": 23948608, + "step": 26475 + }, + { + "epoch": 6.988781839778277, + "grad_norm": 0.0015559736639261246, + "learning_rate": 0.07692276924753247, + "loss": 0.0496, + "num_input_tokens_seen": 23953440, + "step": 26480 + }, + { + "epoch": 6.990101623333773, + "grad_norm": 0.003456976031884551, + "learning_rate": 0.07687133322491124, + "loss": 0.0376, + "num_input_tokens_seen": 23958272, + "step": 26485 + }, + { + "epoch": 6.99142140688927, + "grad_norm": 0.001006866805255413, + "learning_rate": 0.07681990847964948, + "loss": 0.0448, + "num_input_tokens_seen": 23962560, + "step": 26490 + }, + { + "epoch": 6.992741190444767, + "grad_norm": 0.0010299331042915583, + "learning_rate": 0.0767684950196774, + "loss": 0.0209, + "num_input_tokens_seen": 23967072, + "step": 26495 + }, + { + "epoch": 6.994060974000264, + "grad_norm": 0.002588722389191389, + "learning_rate": 0.0767170928529237, + "loss": 0.0815, + "num_input_tokens_seen": 23971456, + "step": 26500 + }, + { + "epoch": 6.995380757555761, + "grad_norm": 0.0030673586297780275, + "learning_rate": 0.07666570198731526, + "loss": 0.0237, + "num_input_tokens_seen": 23976096, + "step": 26505 + }, + { + "epoch": 6.996700541111258, + "grad_norm": 0.003740586107596755, + "learning_rate": 0.07661432243077708, + "loss": 0.036, + "num_input_tokens_seen": 23980608, + "step": 26510 + }, + { + "epoch": 6.998020324666754, + "grad_norm": 0.0020384921226650476, + "learning_rate": 0.0765629541912326, + "loss": 0.0495, + "num_input_tokens_seen": 23985248, + "step": 26515 + }, + { + "epoch": 6.999340108222252, + "grad_norm": 0.00344674801453948, + "learning_rate": 0.07651159727660352, + "loss": 0.0487, + "num_input_tokens_seen": 23989856, + "step": 26520 + }, + { + "epoch": 7.000527913422199, + "grad_norm": 0.0007907776162028313, + "learning_rate": 0.07646025169480959, + "loss": 0.0608, + "num_input_tokens_seen": 23994128, + "step": 26525 + }, + { + "epoch": 7.001847696977696, + "grad_norm": 0.002421510638669133, + "learning_rate": 0.07640891745376908, + "loss": 0.0241, + "num_input_tokens_seen": 23998960, + "step": 26530 + }, + { + "epoch": 7.003167480533192, + "grad_norm": 0.0009775101207196712, + "learning_rate": 0.07635759456139822, + "loss": 0.0276, + "num_input_tokens_seen": 24003440, + "step": 26535 + }, + { + "epoch": 7.00448726408869, + "grad_norm": 0.0015487836208194494, + "learning_rate": 0.0763062830256118, + "loss": 0.0303, + "num_input_tokens_seen": 24007728, + "step": 26540 + }, + { + "epoch": 7.005807047644186, + "grad_norm": 0.00036305878893472254, + "learning_rate": 0.07625498285432258, + "loss": 0.0221, + "num_input_tokens_seen": 24012368, + "step": 26545 + }, + { + "epoch": 7.007126831199683, + "grad_norm": 0.0035421857610344887, + "learning_rate": 0.07620369405544176, + "loss": 0.0536, + "num_input_tokens_seen": 24016528, + "step": 26550 + }, + { + "epoch": 7.00844661475518, + "grad_norm": 0.00017587447655387223, + "learning_rate": 0.07615241663687868, + "loss": 0.0337, + "num_input_tokens_seen": 24021136, + "step": 26555 + }, + { + "epoch": 7.009766398310677, + "grad_norm": 0.0011459741508588195, + "learning_rate": 0.07610115060654106, + "loss": 0.0287, + "num_input_tokens_seen": 24025616, + "step": 26560 + }, + { + "epoch": 7.011086181866174, + "grad_norm": 0.001611366868019104, + "learning_rate": 0.07604989597233458, + "loss": 0.0426, + "num_input_tokens_seen": 24029872, + "step": 26565 + }, + { + "epoch": 7.012405965421671, + "grad_norm": 0.0003495160781312734, + "learning_rate": 0.07599865274216352, + "loss": 0.0176, + "num_input_tokens_seen": 24034448, + "step": 26570 + }, + { + "epoch": 7.013725748977167, + "grad_norm": 0.0015696630580350757, + "learning_rate": 0.07594742092393013, + "loss": 0.0239, + "num_input_tokens_seen": 24038800, + "step": 26575 + }, + { + "epoch": 7.015045532532665, + "grad_norm": 0.0028369580395519733, + "learning_rate": 0.07589620052553503, + "loss": 0.0373, + "num_input_tokens_seen": 24043504, + "step": 26580 + }, + { + "epoch": 7.016365316088161, + "grad_norm": 0.0018523171311244369, + "learning_rate": 0.0758449915548771, + "loss": 0.0141, + "num_input_tokens_seen": 24048176, + "step": 26585 + }, + { + "epoch": 7.017685099643659, + "grad_norm": 0.0021274795290082693, + "learning_rate": 0.07579379401985332, + "loss": 0.0343, + "num_input_tokens_seen": 24052912, + "step": 26590 + }, + { + "epoch": 7.019004883199155, + "grad_norm": 0.0009079097071662545, + "learning_rate": 0.07574260792835905, + "loss": 0.0353, + "num_input_tokens_seen": 24057200, + "step": 26595 + }, + { + "epoch": 7.020324666754652, + "grad_norm": 0.0004158065130468458, + "learning_rate": 0.07569143328828784, + "loss": 0.0413, + "num_input_tokens_seen": 24061744, + "step": 26600 + }, + { + "epoch": 7.020324666754652, + "eval_loss": 0.08997534960508347, + "eval_runtime": 75.9167, + "eval_samples_per_second": 88.716, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 24061744, + "step": 26600 + }, + { + "epoch": 7.021644450310149, + "grad_norm": 0.001775218523107469, + "learning_rate": 0.0756402701075314, + "loss": 0.0197, + "num_input_tokens_seen": 24066384, + "step": 26605 + }, + { + "epoch": 7.022964233865646, + "grad_norm": 0.0011776033788919449, + "learning_rate": 0.07558911839397982, + "loss": 0.0271, + "num_input_tokens_seen": 24070672, + "step": 26610 + }, + { + "epoch": 7.0242840174211425, + "grad_norm": 0.0018632832216098905, + "learning_rate": 0.07553797815552123, + "loss": 0.041, + "num_input_tokens_seen": 24075344, + "step": 26615 + }, + { + "epoch": 7.02560380097664, + "grad_norm": 0.002867121482267976, + "learning_rate": 0.07548684940004222, + "loss": 0.0266, + "num_input_tokens_seen": 24079632, + "step": 26620 + }, + { + "epoch": 7.0269235845321365, + "grad_norm": 0.002389148809015751, + "learning_rate": 0.07543573213542744, + "loss": 0.0208, + "num_input_tokens_seen": 24084208, + "step": 26625 + }, + { + "epoch": 7.028243368087634, + "grad_norm": 0.0024083079770207405, + "learning_rate": 0.0753846263695597, + "loss": 0.0292, + "num_input_tokens_seen": 24088848, + "step": 26630 + }, + { + "epoch": 7.0295631516431305, + "grad_norm": 0.0015779027016833425, + "learning_rate": 0.07533353211032029, + "loss": 0.031, + "num_input_tokens_seen": 24093456, + "step": 26635 + }, + { + "epoch": 7.030882935198627, + "grad_norm": 0.00432176049798727, + "learning_rate": 0.07528244936558857, + "loss": 0.0327, + "num_input_tokens_seen": 24098096, + "step": 26640 + }, + { + "epoch": 7.0322027187541245, + "grad_norm": 0.0025957708712667227, + "learning_rate": 0.07523137814324206, + "loss": 0.0458, + "num_input_tokens_seen": 24102352, + "step": 26645 + }, + { + "epoch": 7.033522502309621, + "grad_norm": 0.002869841642677784, + "learning_rate": 0.07518031845115672, + "loss": 0.043, + "num_input_tokens_seen": 24106864, + "step": 26650 + }, + { + "epoch": 7.0348422858651185, + "grad_norm": 0.0003461731248535216, + "learning_rate": 0.07512927029720647, + "loss": 0.0198, + "num_input_tokens_seen": 24111280, + "step": 26655 + }, + { + "epoch": 7.036162069420615, + "grad_norm": 0.003162543987855315, + "learning_rate": 0.0750782336892636, + "loss": 0.0364, + "num_input_tokens_seen": 24115728, + "step": 26660 + }, + { + "epoch": 7.037481852976112, + "grad_norm": 0.001046111574396491, + "learning_rate": 0.0750272086351987, + "loss": 0.0309, + "num_input_tokens_seen": 24120272, + "step": 26665 + }, + { + "epoch": 7.038801636531609, + "grad_norm": 0.00036784858093596995, + "learning_rate": 0.07497619514288031, + "loss": 0.0142, + "num_input_tokens_seen": 24124720, + "step": 26670 + }, + { + "epoch": 7.040121420087106, + "grad_norm": 0.0006285954732447863, + "learning_rate": 0.07492519322017545, + "loss": 0.0445, + "num_input_tokens_seen": 24129392, + "step": 26675 + }, + { + "epoch": 7.041441203642602, + "grad_norm": 0.00011735918087651953, + "learning_rate": 0.0748742028749493, + "loss": 0.0397, + "num_input_tokens_seen": 24134192, + "step": 26680 + }, + { + "epoch": 7.0427609871981, + "grad_norm": 0.0012894441606476903, + "learning_rate": 0.0748232241150651, + "loss": 0.0536, + "num_input_tokens_seen": 24138704, + "step": 26685 + }, + { + "epoch": 7.044080770753596, + "grad_norm": 0.0007961641531437635, + "learning_rate": 0.07477225694838453, + "loss": 0.0319, + "num_input_tokens_seen": 24142960, + "step": 26690 + }, + { + "epoch": 7.045400554309094, + "grad_norm": 0.0048098452389240265, + "learning_rate": 0.07472130138276731, + "loss": 0.0515, + "num_input_tokens_seen": 24147376, + "step": 26695 + }, + { + "epoch": 7.04672033786459, + "grad_norm": 0.00029844610253348947, + "learning_rate": 0.07467035742607138, + "loss": 0.0138, + "num_input_tokens_seen": 24151696, + "step": 26700 + }, + { + "epoch": 7.048040121420087, + "grad_norm": 0.0028699482791125774, + "learning_rate": 0.07461942508615303, + "loss": 0.079, + "num_input_tokens_seen": 24156080, + "step": 26705 + }, + { + "epoch": 7.049359904975584, + "grad_norm": 0.0014384252717718482, + "learning_rate": 0.07456850437086657, + "loss": 0.0286, + "num_input_tokens_seen": 24160432, + "step": 26710 + }, + { + "epoch": 7.050679688531081, + "grad_norm": 0.0008005049312487245, + "learning_rate": 0.07451759528806468, + "loss": 0.0304, + "num_input_tokens_seen": 24164880, + "step": 26715 + }, + { + "epoch": 7.051999472086578, + "grad_norm": 0.0025159912183880806, + "learning_rate": 0.0744666978455982, + "loss": 0.0365, + "num_input_tokens_seen": 24169296, + "step": 26720 + }, + { + "epoch": 7.053319255642075, + "grad_norm": 0.0007182505214586854, + "learning_rate": 0.07441581205131609, + "loss": 0.0395, + "num_input_tokens_seen": 24173840, + "step": 26725 + }, + { + "epoch": 7.054639039197571, + "grad_norm": 0.002750784857198596, + "learning_rate": 0.07436493791306566, + "loss": 0.0394, + "num_input_tokens_seen": 24178320, + "step": 26730 + }, + { + "epoch": 7.055958822753069, + "grad_norm": 0.0002753825392574072, + "learning_rate": 0.07431407543869223, + "loss": 0.0378, + "num_input_tokens_seen": 24182992, + "step": 26735 + }, + { + "epoch": 7.057278606308565, + "grad_norm": 0.0005123476148582995, + "learning_rate": 0.0742632246360395, + "loss": 0.0108, + "num_input_tokens_seen": 24187344, + "step": 26740 + }, + { + "epoch": 7.058598389864063, + "grad_norm": 0.005444681737571955, + "learning_rate": 0.07421238551294934, + "loss": 0.0327, + "num_input_tokens_seen": 24191728, + "step": 26745 + }, + { + "epoch": 7.059918173419559, + "grad_norm": 0.003525279462337494, + "learning_rate": 0.07416155807726171, + "loss": 0.0338, + "num_input_tokens_seen": 24195728, + "step": 26750 + }, + { + "epoch": 7.061237956975056, + "grad_norm": 0.0016002849442884326, + "learning_rate": 0.07411074233681492, + "loss": 0.0439, + "num_input_tokens_seen": 24200336, + "step": 26755 + }, + { + "epoch": 7.062557740530553, + "grad_norm": 0.0019145117839798331, + "learning_rate": 0.07405993829944528, + "loss": 0.0362, + "num_input_tokens_seen": 24204752, + "step": 26760 + }, + { + "epoch": 7.06387752408605, + "grad_norm": 0.004803716205060482, + "learning_rate": 0.07400914597298755, + "loss": 0.0445, + "num_input_tokens_seen": 24209360, + "step": 26765 + }, + { + "epoch": 7.065197307641546, + "grad_norm": 0.001959898043423891, + "learning_rate": 0.07395836536527445, + "loss": 0.0294, + "num_input_tokens_seen": 24213904, + "step": 26770 + }, + { + "epoch": 7.066517091197044, + "grad_norm": 0.001149587333202362, + "learning_rate": 0.07390759648413696, + "loss": 0.0189, + "num_input_tokens_seen": 24218064, + "step": 26775 + }, + { + "epoch": 7.06783687475254, + "grad_norm": 0.0030017609242349863, + "learning_rate": 0.07385683933740435, + "loss": 0.0235, + "num_input_tokens_seen": 24222416, + "step": 26780 + }, + { + "epoch": 7.069156658308038, + "grad_norm": 0.0012285164557397366, + "learning_rate": 0.07380609393290402, + "loss": 0.0284, + "num_input_tokens_seen": 24227024, + "step": 26785 + }, + { + "epoch": 7.070476441863534, + "grad_norm": 0.0013311418006196618, + "learning_rate": 0.07375536027846147, + "loss": 0.033, + "num_input_tokens_seen": 24231504, + "step": 26790 + }, + { + "epoch": 7.071796225419031, + "grad_norm": 0.0010067974217236042, + "learning_rate": 0.07370463838190057, + "loss": 0.0299, + "num_input_tokens_seen": 24236208, + "step": 26795 + }, + { + "epoch": 7.073116008974528, + "grad_norm": 0.0018748781876638532, + "learning_rate": 0.07365392825104317, + "loss": 0.0474, + "num_input_tokens_seen": 24240720, + "step": 26800 + }, + { + "epoch": 7.073116008974528, + "eval_loss": 0.09719076752662659, + "eval_runtime": 75.9068, + "eval_samples_per_second": 88.727, + "eval_steps_per_second": 22.185, + "num_input_tokens_seen": 24240720, + "step": 26800 + }, + { + "epoch": 7.074435792530025, + "grad_norm": 0.0002149714418919757, + "learning_rate": 0.07360322989370945, + "loss": 0.0103, + "num_input_tokens_seen": 24245232, + "step": 26805 + }, + { + "epoch": 7.0757555760855215, + "grad_norm": 0.001220128033310175, + "learning_rate": 0.07355254331771781, + "loss": 0.0248, + "num_input_tokens_seen": 24249872, + "step": 26810 + }, + { + "epoch": 7.077075359641019, + "grad_norm": 0.002493513748049736, + "learning_rate": 0.07350186853088461, + "loss": 0.0388, + "num_input_tokens_seen": 24254672, + "step": 26815 + }, + { + "epoch": 7.0783951431965155, + "grad_norm": 0.00048247704398818314, + "learning_rate": 0.07345120554102462, + "loss": 0.0197, + "num_input_tokens_seen": 24259248, + "step": 26820 + }, + { + "epoch": 7.079714926752013, + "grad_norm": 0.0014814446913078427, + "learning_rate": 0.07340055435595079, + "loss": 0.0385, + "num_input_tokens_seen": 24263952, + "step": 26825 + }, + { + "epoch": 7.0810347103075095, + "grad_norm": 0.00274215592071414, + "learning_rate": 0.07334991498347401, + "loss": 0.021, + "num_input_tokens_seen": 24268368, + "step": 26830 + }, + { + "epoch": 7.082354493863006, + "grad_norm": 0.0029015198815613985, + "learning_rate": 0.07329928743140365, + "loss": 0.0204, + "num_input_tokens_seen": 24272944, + "step": 26835 + }, + { + "epoch": 7.0836742774185035, + "grad_norm": 0.0033548446372151375, + "learning_rate": 0.07324867170754705, + "loss": 0.0859, + "num_input_tokens_seen": 24277680, + "step": 26840 + }, + { + "epoch": 7.084994060974, + "grad_norm": 0.0034225271083414555, + "learning_rate": 0.07319806781970974, + "loss": 0.0593, + "num_input_tokens_seen": 24282256, + "step": 26845 + }, + { + "epoch": 7.0863138445294975, + "grad_norm": 0.000822693866211921, + "learning_rate": 0.07314747577569555, + "loss": 0.0338, + "num_input_tokens_seen": 24286480, + "step": 26850 + }, + { + "epoch": 7.087633628084994, + "grad_norm": 0.00027323354152031243, + "learning_rate": 0.07309689558330636, + "loss": 0.0112, + "num_input_tokens_seen": 24291152, + "step": 26855 + }, + { + "epoch": 7.088953411640491, + "grad_norm": 0.0027346687857061625, + "learning_rate": 0.0730463272503423, + "loss": 0.061, + "num_input_tokens_seen": 24295792, + "step": 26860 + }, + { + "epoch": 7.090273195195988, + "grad_norm": 0.003160706255584955, + "learning_rate": 0.07299577078460168, + "loss": 0.055, + "num_input_tokens_seen": 24300368, + "step": 26865 + }, + { + "epoch": 7.091592978751485, + "grad_norm": 0.0008625878836028278, + "learning_rate": 0.07294522619388083, + "loss": 0.0676, + "num_input_tokens_seen": 24304848, + "step": 26870 + }, + { + "epoch": 7.092912762306982, + "grad_norm": 0.0031733205541968346, + "learning_rate": 0.07289469348597452, + "loss": 0.0323, + "num_input_tokens_seen": 24309584, + "step": 26875 + }, + { + "epoch": 7.094232545862479, + "grad_norm": 0.0037439358420670033, + "learning_rate": 0.07284417266867535, + "loss": 0.0365, + "num_input_tokens_seen": 24313904, + "step": 26880 + }, + { + "epoch": 7.095552329417975, + "grad_norm": 0.002250345190986991, + "learning_rate": 0.07279366374977439, + "loss": 0.0104, + "num_input_tokens_seen": 24318512, + "step": 26885 + }, + { + "epoch": 7.096872112973473, + "grad_norm": 0.0019418989541009068, + "learning_rate": 0.07274316673706074, + "loss": 0.0377, + "num_input_tokens_seen": 24323056, + "step": 26890 + }, + { + "epoch": 7.098191896528969, + "grad_norm": 0.0013767321361228824, + "learning_rate": 0.07269268163832161, + "loss": 0.0608, + "num_input_tokens_seen": 24327536, + "step": 26895 + }, + { + "epoch": 7.099511680084466, + "grad_norm": 0.004502629395574331, + "learning_rate": 0.07264220846134248, + "loss": 0.0489, + "num_input_tokens_seen": 24331984, + "step": 26900 + }, + { + "epoch": 7.100831463639963, + "grad_norm": 0.0011884066043421626, + "learning_rate": 0.07259174721390699, + "loss": 0.0153, + "num_input_tokens_seen": 24336560, + "step": 26905 + }, + { + "epoch": 7.10215124719546, + "grad_norm": 0.002672799862921238, + "learning_rate": 0.07254129790379686, + "loss": 0.045, + "num_input_tokens_seen": 24340880, + "step": 26910 + }, + { + "epoch": 7.103471030750957, + "grad_norm": 0.0020808728877454996, + "learning_rate": 0.072490860538792, + "loss": 0.018, + "num_input_tokens_seen": 24345360, + "step": 26915 + }, + { + "epoch": 7.104790814306454, + "grad_norm": 0.0014107075985521078, + "learning_rate": 0.07244043512667042, + "loss": 0.0387, + "num_input_tokens_seen": 24350064, + "step": 26920 + }, + { + "epoch": 7.10611059786195, + "grad_norm": 0.0013718680711463094, + "learning_rate": 0.07239002167520843, + "loss": 0.0544, + "num_input_tokens_seen": 24354480, + "step": 26925 + }, + { + "epoch": 7.107430381417448, + "grad_norm": 0.002748737810179591, + "learning_rate": 0.07233962019218045, + "loss": 0.0177, + "num_input_tokens_seen": 24358896, + "step": 26930 + }, + { + "epoch": 7.108750164972944, + "grad_norm": 0.0031471555121243, + "learning_rate": 0.07228923068535892, + "loss": 0.0467, + "num_input_tokens_seen": 24363312, + "step": 26935 + }, + { + "epoch": 7.110069948528442, + "grad_norm": 0.0010870584519580007, + "learning_rate": 0.0722388531625146, + "loss": 0.0367, + "num_input_tokens_seen": 24367760, + "step": 26940 + }, + { + "epoch": 7.111389732083938, + "grad_norm": 0.0021739527583122253, + "learning_rate": 0.07218848763141639, + "loss": 0.0291, + "num_input_tokens_seen": 24372304, + "step": 26945 + }, + { + "epoch": 7.112709515639435, + "grad_norm": 0.002224267227575183, + "learning_rate": 0.07213813409983118, + "loss": 0.015, + "num_input_tokens_seen": 24377232, + "step": 26950 + }, + { + "epoch": 7.114029299194932, + "grad_norm": 0.0020882037933915854, + "learning_rate": 0.0720877925755242, + "loss": 0.0258, + "num_input_tokens_seen": 24381968, + "step": 26955 + }, + { + "epoch": 7.115349082750429, + "grad_norm": 0.0040739295072853565, + "learning_rate": 0.07203746306625866, + "loss": 0.0309, + "num_input_tokens_seen": 24386992, + "step": 26960 + }, + { + "epoch": 7.116668866305925, + "grad_norm": 0.0005062742275185883, + "learning_rate": 0.07198714557979606, + "loss": 0.0089, + "num_input_tokens_seen": 24391504, + "step": 26965 + }, + { + "epoch": 7.117988649861423, + "grad_norm": 0.0010745273903012276, + "learning_rate": 0.07193684012389602, + "loss": 0.0342, + "num_input_tokens_seen": 24396112, + "step": 26970 + }, + { + "epoch": 7.119308433416919, + "grad_norm": 0.0008000729139894247, + "learning_rate": 0.07188654670631621, + "loss": 0.0455, + "num_input_tokens_seen": 24400624, + "step": 26975 + }, + { + "epoch": 7.120628216972417, + "grad_norm": 0.0011125380406156182, + "learning_rate": 0.07183626533481258, + "loss": 0.029, + "num_input_tokens_seen": 24405264, + "step": 26980 + }, + { + "epoch": 7.121948000527913, + "grad_norm": 0.005635814741253853, + "learning_rate": 0.07178599601713909, + "loss": 0.0698, + "num_input_tokens_seen": 24409904, + "step": 26985 + }, + { + "epoch": 7.12326778408341, + "grad_norm": 0.004090440459549427, + "learning_rate": 0.07173573876104786, + "loss": 0.0476, + "num_input_tokens_seen": 24414512, + "step": 26990 + }, + { + "epoch": 7.124587567638907, + "grad_norm": 0.001152562559582293, + "learning_rate": 0.0716854935742893, + "loss": 0.036, + "num_input_tokens_seen": 24418928, + "step": 26995 + }, + { + "epoch": 7.125907351194404, + "grad_norm": 0.0022135707549750805, + "learning_rate": 0.07163526046461174, + "loss": 0.0218, + "num_input_tokens_seen": 24423344, + "step": 27000 + }, + { + "epoch": 7.125907351194404, + "eval_loss": 0.09540069103240967, + "eval_runtime": 75.9499, + "eval_samples_per_second": 88.677, + "eval_steps_per_second": 22.173, + "num_input_tokens_seen": 24423344, + "step": 27000 + }, + { + "epoch": 7.127227134749901, + "grad_norm": 0.0036078630946576595, + "learning_rate": 0.07158503943976181, + "loss": 0.0514, + "num_input_tokens_seen": 24427920, + "step": 27005 + }, + { + "epoch": 7.128546918305398, + "grad_norm": 0.004261850379407406, + "learning_rate": 0.07153483050748427, + "loss": 0.0353, + "num_input_tokens_seen": 24432272, + "step": 27010 + }, + { + "epoch": 7.1298667018608946, + "grad_norm": 0.00041211445932276547, + "learning_rate": 0.07148463367552188, + "loss": 0.0298, + "num_input_tokens_seen": 24436656, + "step": 27015 + }, + { + "epoch": 7.131186485416392, + "grad_norm": 0.006395851727575064, + "learning_rate": 0.07143444895161565, + "loss": 0.0398, + "num_input_tokens_seen": 24441104, + "step": 27020 + }, + { + "epoch": 7.1325062689718886, + "grad_norm": 0.0019192384788766503, + "learning_rate": 0.07138427634350476, + "loss": 0.0219, + "num_input_tokens_seen": 24445264, + "step": 27025 + }, + { + "epoch": 7.133826052527385, + "grad_norm": 0.0005994492094032466, + "learning_rate": 0.07133411585892636, + "loss": 0.0439, + "num_input_tokens_seen": 24449840, + "step": 27030 + }, + { + "epoch": 7.1351458360828826, + "grad_norm": 0.000387262727599591, + "learning_rate": 0.07128396750561593, + "loss": 0.0135, + "num_input_tokens_seen": 24454256, + "step": 27035 + }, + { + "epoch": 7.136465619638379, + "grad_norm": 0.001939240493811667, + "learning_rate": 0.07123383129130685, + "loss": 0.0349, + "num_input_tokens_seen": 24458736, + "step": 27040 + }, + { + "epoch": 7.1377854031938766, + "grad_norm": 0.0002093533257720992, + "learning_rate": 0.07118370722373084, + "loss": 0.0183, + "num_input_tokens_seen": 24463216, + "step": 27045 + }, + { + "epoch": 7.139105186749373, + "grad_norm": 0.0033651406411081553, + "learning_rate": 0.07113359531061769, + "loss": 0.0526, + "num_input_tokens_seen": 24467664, + "step": 27050 + }, + { + "epoch": 7.14042497030487, + "grad_norm": 0.0005112751387059689, + "learning_rate": 0.07108349555969525, + "loss": 0.0365, + "num_input_tokens_seen": 24472432, + "step": 27055 + }, + { + "epoch": 7.141744753860367, + "grad_norm": 0.0012578018940985203, + "learning_rate": 0.07103340797868944, + "loss": 0.0126, + "num_input_tokens_seen": 24476848, + "step": 27060 + }, + { + "epoch": 7.143064537415864, + "grad_norm": 0.003395787440240383, + "learning_rate": 0.07098333257532453, + "loss": 0.054, + "num_input_tokens_seen": 24481648, + "step": 27065 + }, + { + "epoch": 7.144384320971361, + "grad_norm": 0.0008309588301926851, + "learning_rate": 0.07093326935732269, + "loss": 0.0315, + "num_input_tokens_seen": 24486064, + "step": 27070 + }, + { + "epoch": 7.145704104526858, + "grad_norm": 0.0017842055531218648, + "learning_rate": 0.0708832183324044, + "loss": 0.0202, + "num_input_tokens_seen": 24490768, + "step": 27075 + }, + { + "epoch": 7.147023888082354, + "grad_norm": 0.0024569337256252766, + "learning_rate": 0.07083317950828799, + "loss": 0.0247, + "num_input_tokens_seen": 24495280, + "step": 27080 + }, + { + "epoch": 7.148343671637852, + "grad_norm": 0.0006912237731739879, + "learning_rate": 0.0707831528926902, + "loss": 0.0326, + "num_input_tokens_seen": 24499536, + "step": 27085 + }, + { + "epoch": 7.149663455193348, + "grad_norm": 0.00047139814705587924, + "learning_rate": 0.07073313849332578, + "loss": 0.0348, + "num_input_tokens_seen": 24504176, + "step": 27090 + }, + { + "epoch": 7.150983238748845, + "grad_norm": 0.0024766309652477503, + "learning_rate": 0.07068313631790749, + "loss": 0.0792, + "num_input_tokens_seen": 24508464, + "step": 27095 + }, + { + "epoch": 7.152303022304342, + "grad_norm": 0.0014131352072581649, + "learning_rate": 0.07063314637414632, + "loss": 0.0159, + "num_input_tokens_seen": 24513104, + "step": 27100 + }, + { + "epoch": 7.153622805859839, + "grad_norm": 0.001672928687185049, + "learning_rate": 0.07058316866975144, + "loss": 0.0551, + "num_input_tokens_seen": 24517712, + "step": 27105 + }, + { + "epoch": 7.154942589415336, + "grad_norm": 0.0031093116849660873, + "learning_rate": 0.0705332032124299, + "loss": 0.0599, + "num_input_tokens_seen": 24522192, + "step": 27110 + }, + { + "epoch": 7.156262372970833, + "grad_norm": 0.0007814282434992492, + "learning_rate": 0.0704832500098871, + "loss": 0.0192, + "num_input_tokens_seen": 24526800, + "step": 27115 + }, + { + "epoch": 7.157582156526329, + "grad_norm": 0.00569351390004158, + "learning_rate": 0.07043330906982641, + "loss": 0.0541, + "num_input_tokens_seen": 24531376, + "step": 27120 + }, + { + "epoch": 7.158901940081827, + "grad_norm": 0.006457604933530092, + "learning_rate": 0.07038338039994936, + "loss": 0.0487, + "num_input_tokens_seen": 24535824, + "step": 27125 + }, + { + "epoch": 7.160221723637323, + "grad_norm": 0.0028297933749854565, + "learning_rate": 0.07033346400795562, + "loss": 0.0338, + "num_input_tokens_seen": 24540304, + "step": 27130 + }, + { + "epoch": 7.161541507192821, + "grad_norm": 0.002002082532271743, + "learning_rate": 0.07028355990154282, + "loss": 0.0247, + "num_input_tokens_seen": 24545200, + "step": 27135 + }, + { + "epoch": 7.162861290748317, + "grad_norm": 0.002341208979487419, + "learning_rate": 0.07023366808840685, + "loss": 0.0338, + "num_input_tokens_seen": 24549424, + "step": 27140 + }, + { + "epoch": 7.164181074303814, + "grad_norm": 0.0021057582926005125, + "learning_rate": 0.07018378857624172, + "loss": 0.0338, + "num_input_tokens_seen": 24553968, + "step": 27145 + }, + { + "epoch": 7.165500857859311, + "grad_norm": 0.0009361207485198975, + "learning_rate": 0.0701339213727394, + "loss": 0.0106, + "num_input_tokens_seen": 24558000, + "step": 27150 + }, + { + "epoch": 7.166820641414808, + "grad_norm": 0.0005567215266637504, + "learning_rate": 0.07008406648559008, + "loss": 0.025, + "num_input_tokens_seen": 24562320, + "step": 27155 + }, + { + "epoch": 7.1681404249703045, + "grad_norm": 0.002413601614534855, + "learning_rate": 0.07003422392248196, + "loss": 0.0584, + "num_input_tokens_seen": 24566768, + "step": 27160 + }, + { + "epoch": 7.169460208525802, + "grad_norm": 0.001880260300822556, + "learning_rate": 0.06998439369110142, + "loss": 0.0099, + "num_input_tokens_seen": 24571376, + "step": 27165 + }, + { + "epoch": 7.1707799920812985, + "grad_norm": 0.0021660670172423124, + "learning_rate": 0.06993457579913295, + "loss": 0.0531, + "num_input_tokens_seen": 24576112, + "step": 27170 + }, + { + "epoch": 7.172099775636796, + "grad_norm": 0.0027689754497259855, + "learning_rate": 0.06988477025425903, + "loss": 0.0277, + "num_input_tokens_seen": 24580912, + "step": 27175 + }, + { + "epoch": 7.1734195591922925, + "grad_norm": 0.0007326871855184436, + "learning_rate": 0.06983497706416032, + "loss": 0.0203, + "num_input_tokens_seen": 24585328, + "step": 27180 + }, + { + "epoch": 7.174739342747789, + "grad_norm": 0.005778882652521133, + "learning_rate": 0.0697851962365156, + "loss": 0.0591, + "num_input_tokens_seen": 24589936, + "step": 27185 + }, + { + "epoch": 7.1760591263032865, + "grad_norm": 0.0012047074269503355, + "learning_rate": 0.06973542777900163, + "loss": 0.0375, + "num_input_tokens_seen": 24594000, + "step": 27190 + }, + { + "epoch": 7.177378909858783, + "grad_norm": 0.0013136083725839853, + "learning_rate": 0.06968567169929342, + "loss": 0.0603, + "num_input_tokens_seen": 24598608, + "step": 27195 + }, + { + "epoch": 7.1786986934142805, + "grad_norm": 0.0019146183039993048, + "learning_rate": 0.06963592800506392, + "loss": 0.0721, + "num_input_tokens_seen": 24603344, + "step": 27200 + }, + { + "epoch": 7.1786986934142805, + "eval_loss": 0.09255992621183395, + "eval_runtime": 75.8718, + "eval_samples_per_second": 88.768, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 24603344, + "step": 27200 + }, + { + "epoch": 7.180018476969777, + "grad_norm": 0.0024641624186187983, + "learning_rate": 0.06958619670398417, + "loss": 0.0249, + "num_input_tokens_seen": 24607888, + "step": 27205 + }, + { + "epoch": 7.181338260525274, + "grad_norm": 0.0038225525058805943, + "learning_rate": 0.0695364778037235, + "loss": 0.0395, + "num_input_tokens_seen": 24612080, + "step": 27210 + }, + { + "epoch": 7.182658044080771, + "grad_norm": 0.001573466812260449, + "learning_rate": 0.06948677131194907, + "loss": 0.0467, + "num_input_tokens_seen": 24616848, + "step": 27215 + }, + { + "epoch": 7.183977827636268, + "grad_norm": 0.0007114731706678867, + "learning_rate": 0.06943707723632629, + "loss": 0.0328, + "num_input_tokens_seen": 24621584, + "step": 27220 + }, + { + "epoch": 7.185297611191764, + "grad_norm": 0.003139874432235956, + "learning_rate": 0.06938739558451867, + "loss": 0.0213, + "num_input_tokens_seen": 24626256, + "step": 27225 + }, + { + "epoch": 7.186617394747262, + "grad_norm": 0.0009823748841881752, + "learning_rate": 0.06933772636418763, + "loss": 0.0221, + "num_input_tokens_seen": 24630832, + "step": 27230 + }, + { + "epoch": 7.187937178302758, + "grad_norm": 0.0028652155306190252, + "learning_rate": 0.06928806958299293, + "loss": 0.03, + "num_input_tokens_seen": 24635472, + "step": 27235 + }, + { + "epoch": 7.189256961858256, + "grad_norm": 0.00112786830868572, + "learning_rate": 0.06923842524859211, + "loss": 0.0512, + "num_input_tokens_seen": 24639824, + "step": 27240 + }, + { + "epoch": 7.190576745413752, + "grad_norm": 0.0015413234941661358, + "learning_rate": 0.06918879336864105, + "loss": 0.0582, + "num_input_tokens_seen": 24644304, + "step": 27245 + }, + { + "epoch": 7.191896528969249, + "grad_norm": 0.0017558628460392356, + "learning_rate": 0.06913917395079362, + "loss": 0.0331, + "num_input_tokens_seen": 24648944, + "step": 27250 + }, + { + "epoch": 7.193216312524746, + "grad_norm": 0.002351338043808937, + "learning_rate": 0.0690895670027017, + "loss": 0.0423, + "num_input_tokens_seen": 24653616, + "step": 27255 + }, + { + "epoch": 7.194536096080243, + "grad_norm": 0.00030776349012739956, + "learning_rate": 0.06903997253201531, + "loss": 0.0106, + "num_input_tokens_seen": 24658384, + "step": 27260 + }, + { + "epoch": 7.19585587963574, + "grad_norm": 0.000973102287389338, + "learning_rate": 0.06899039054638263, + "loss": 0.0631, + "num_input_tokens_seen": 24662768, + "step": 27265 + }, + { + "epoch": 7.197175663191237, + "grad_norm": 0.0008092315983958542, + "learning_rate": 0.06894082105344976, + "loss": 0.0295, + "num_input_tokens_seen": 24666992, + "step": 27270 + }, + { + "epoch": 7.198495446746733, + "grad_norm": 0.0038063018582761288, + "learning_rate": 0.06889126406086087, + "loss": 0.0514, + "num_input_tokens_seen": 24671344, + "step": 27275 + }, + { + "epoch": 7.199815230302231, + "grad_norm": 0.0019079557387158275, + "learning_rate": 0.0688417195762584, + "loss": 0.0489, + "num_input_tokens_seen": 24675856, + "step": 27280 + }, + { + "epoch": 7.201135013857727, + "grad_norm": 0.0021115154959261417, + "learning_rate": 0.06879218760728262, + "loss": 0.0372, + "num_input_tokens_seen": 24680432, + "step": 27285 + }, + { + "epoch": 7.202454797413224, + "grad_norm": 0.0024607244413346052, + "learning_rate": 0.06874266816157207, + "loss": 0.0337, + "num_input_tokens_seen": 24684720, + "step": 27290 + }, + { + "epoch": 7.203774580968721, + "grad_norm": 0.0005062593263573945, + "learning_rate": 0.06869316124676321, + "loss": 0.0236, + "num_input_tokens_seen": 24689008, + "step": 27295 + }, + { + "epoch": 7.205094364524218, + "grad_norm": 0.003226394299417734, + "learning_rate": 0.06864366687049062, + "loss": 0.0431, + "num_input_tokens_seen": 24693872, + "step": 27300 + }, + { + "epoch": 7.206414148079715, + "grad_norm": 0.0007575177005492151, + "learning_rate": 0.06859418504038704, + "loss": 0.0163, + "num_input_tokens_seen": 24698416, + "step": 27305 + }, + { + "epoch": 7.207733931635212, + "grad_norm": 0.0016315359389409423, + "learning_rate": 0.06854471576408311, + "loss": 0.0188, + "num_input_tokens_seen": 24703248, + "step": 27310 + }, + { + "epoch": 7.209053715190708, + "grad_norm": 0.0009984599892050028, + "learning_rate": 0.06849525904920767, + "loss": 0.0136, + "num_input_tokens_seen": 24707856, + "step": 27315 + }, + { + "epoch": 7.210373498746206, + "grad_norm": 0.0018433567602187395, + "learning_rate": 0.06844581490338748, + "loss": 0.0314, + "num_input_tokens_seen": 24712048, + "step": 27320 + }, + { + "epoch": 7.211693282301702, + "grad_norm": 0.0030766683630645275, + "learning_rate": 0.06839638333424752, + "loss": 0.0522, + "num_input_tokens_seen": 24716496, + "step": 27325 + }, + { + "epoch": 7.2130130658572, + "grad_norm": 0.00272299419157207, + "learning_rate": 0.06834696434941082, + "loss": 0.0289, + "num_input_tokens_seen": 24720848, + "step": 27330 + }, + { + "epoch": 7.214332849412696, + "grad_norm": 0.0022344994358718395, + "learning_rate": 0.06829755795649824, + "loss": 0.0457, + "num_input_tokens_seen": 24725360, + "step": 27335 + }, + { + "epoch": 7.215652632968193, + "grad_norm": 0.003233659081161022, + "learning_rate": 0.06824816416312904, + "loss": 0.0375, + "num_input_tokens_seen": 24729712, + "step": 27340 + }, + { + "epoch": 7.21697241652369, + "grad_norm": 0.0006167419487610459, + "learning_rate": 0.06819878297692027, + "loss": 0.0291, + "num_input_tokens_seen": 24734256, + "step": 27345 + }, + { + "epoch": 7.218292200079187, + "grad_norm": 0.0001568557054270059, + "learning_rate": 0.0681494144054871, + "loss": 0.0289, + "num_input_tokens_seen": 24739120, + "step": 27350 + }, + { + "epoch": 7.2196119836346835, + "grad_norm": 0.0009983641793951392, + "learning_rate": 0.06810005845644286, + "loss": 0.0404, + "num_input_tokens_seen": 24743472, + "step": 27355 + }, + { + "epoch": 7.220931767190181, + "grad_norm": 0.0024932657834142447, + "learning_rate": 0.06805071513739878, + "loss": 0.0467, + "num_input_tokens_seen": 24747888, + "step": 27360 + }, + { + "epoch": 7.2222515507456775, + "grad_norm": 0.0009728778968565166, + "learning_rate": 0.06800138445596428, + "loss": 0.0405, + "num_input_tokens_seen": 24752528, + "step": 27365 + }, + { + "epoch": 7.223571334301175, + "grad_norm": 0.0027736567426472902, + "learning_rate": 0.06795206641974678, + "loss": 0.014, + "num_input_tokens_seen": 24756976, + "step": 27370 + }, + { + "epoch": 7.2248911178566715, + "grad_norm": 0.0029779027681797743, + "learning_rate": 0.06790276103635169, + "loss": 0.0473, + "num_input_tokens_seen": 24761680, + "step": 27375 + }, + { + "epoch": 7.226210901412168, + "grad_norm": 0.005585158243775368, + "learning_rate": 0.0678534683133826, + "loss": 0.0287, + "num_input_tokens_seen": 24766512, + "step": 27380 + }, + { + "epoch": 7.2275306849676655, + "grad_norm": 0.0006981933256611228, + "learning_rate": 0.06780418825844095, + "loss": 0.0203, + "num_input_tokens_seen": 24771088, + "step": 27385 + }, + { + "epoch": 7.228850468523162, + "grad_norm": 0.0027620114851742983, + "learning_rate": 0.0677549208791264, + "loss": 0.0141, + "num_input_tokens_seen": 24775728, + "step": 27390 + }, + { + "epoch": 7.2301702520786595, + "grad_norm": 0.0005261931219138205, + "learning_rate": 0.06770566618303668, + "loss": 0.0231, + "num_input_tokens_seen": 24780240, + "step": 27395 + }, + { + "epoch": 7.231490035634156, + "grad_norm": 0.0014054578496143222, + "learning_rate": 0.06765642417776736, + "loss": 0.0302, + "num_input_tokens_seen": 24784688, + "step": 27400 + }, + { + "epoch": 7.231490035634156, + "eval_loss": 0.09664735943078995, + "eval_runtime": 75.9992, + "eval_samples_per_second": 88.619, + "eval_steps_per_second": 22.158, + "num_input_tokens_seen": 24784688, + "step": 27400 + }, + { + "epoch": 7.232809819189653, + "grad_norm": 0.004618827253580093, + "learning_rate": 0.0676071948709122, + "loss": 0.0524, + "num_input_tokens_seen": 24788912, + "step": 27405 + }, + { + "epoch": 7.23412960274515, + "grad_norm": 0.0036227181553840637, + "learning_rate": 0.06755797827006307, + "loss": 0.0544, + "num_input_tokens_seen": 24793584, + "step": 27410 + }, + { + "epoch": 7.235449386300647, + "grad_norm": 0.00021623726934194565, + "learning_rate": 0.06750877438280974, + "loss": 0.0221, + "num_input_tokens_seen": 24798000, + "step": 27415 + }, + { + "epoch": 7.236769169856144, + "grad_norm": 0.0005253521958366036, + "learning_rate": 0.06745958321673998, + "loss": 0.044, + "num_input_tokens_seen": 24802640, + "step": 27420 + }, + { + "epoch": 7.238088953411641, + "grad_norm": 0.0035198472905904055, + "learning_rate": 0.0674104047794398, + "loss": 0.0505, + "num_input_tokens_seen": 24806960, + "step": 27425 + }, + { + "epoch": 7.239408736967137, + "grad_norm": 0.004615556914359331, + "learning_rate": 0.06736123907849303, + "loss": 0.0489, + "num_input_tokens_seen": 24811216, + "step": 27430 + }, + { + "epoch": 7.240728520522635, + "grad_norm": 0.0031493024434894323, + "learning_rate": 0.06731208612148178, + "loss": 0.0531, + "num_input_tokens_seen": 24815792, + "step": 27435 + }, + { + "epoch": 7.242048304078131, + "grad_norm": 0.0016655725194141269, + "learning_rate": 0.0672629459159859, + "loss": 0.0428, + "num_input_tokens_seen": 24820176, + "step": 27440 + }, + { + "epoch": 7.243368087633628, + "grad_norm": 0.0011319261975586414, + "learning_rate": 0.0672138184695835, + "loss": 0.0315, + "num_input_tokens_seen": 24824624, + "step": 27445 + }, + { + "epoch": 7.244687871189125, + "grad_norm": 0.00017299568571615964, + "learning_rate": 0.0671647037898507, + "loss": 0.0152, + "num_input_tokens_seen": 24829296, + "step": 27450 + }, + { + "epoch": 7.246007654744622, + "grad_norm": 0.003840027842670679, + "learning_rate": 0.0671156018843615, + "loss": 0.0459, + "num_input_tokens_seen": 24833616, + "step": 27455 + }, + { + "epoch": 7.247327438300119, + "grad_norm": 0.001855796785093844, + "learning_rate": 0.06706651276068812, + "loss": 0.062, + "num_input_tokens_seen": 24838096, + "step": 27460 + }, + { + "epoch": 7.248647221855616, + "grad_norm": 0.00275518000125885, + "learning_rate": 0.06701743642640064, + "loss": 0.0575, + "num_input_tokens_seen": 24842416, + "step": 27465 + }, + { + "epoch": 7.249967005411112, + "grad_norm": 0.0006840745336376131, + "learning_rate": 0.06696837288906729, + "loss": 0.0582, + "num_input_tokens_seen": 24846896, + "step": 27470 + }, + { + "epoch": 7.25128678896661, + "grad_norm": 0.002844541799277067, + "learning_rate": 0.06691932215625432, + "loss": 0.0402, + "num_input_tokens_seen": 24851632, + "step": 27475 + }, + { + "epoch": 7.252606572522106, + "grad_norm": 0.00394688593223691, + "learning_rate": 0.06687028423552589, + "loss": 0.066, + "num_input_tokens_seen": 24856240, + "step": 27480 + }, + { + "epoch": 7.253926356077603, + "grad_norm": 0.0006485776975750923, + "learning_rate": 0.06682125913444435, + "loss": 0.0323, + "num_input_tokens_seen": 24860656, + "step": 27485 + }, + { + "epoch": 7.2552461396331, + "grad_norm": 0.0017951724585145712, + "learning_rate": 0.0667722468605699, + "loss": 0.0407, + "num_input_tokens_seen": 24864912, + "step": 27490 + }, + { + "epoch": 7.256565923188597, + "grad_norm": 0.00241304119117558, + "learning_rate": 0.06672324742146094, + "loss": 0.0512, + "num_input_tokens_seen": 24869392, + "step": 27495 + }, + { + "epoch": 7.257885706744094, + "grad_norm": 0.0008259782334789634, + "learning_rate": 0.06667426082467373, + "loss": 0.035, + "num_input_tokens_seen": 24873968, + "step": 27500 + }, + { + "epoch": 7.259205490299591, + "grad_norm": 0.0016253178473562002, + "learning_rate": 0.0666252870777626, + "loss": 0.0561, + "num_input_tokens_seen": 24878352, + "step": 27505 + }, + { + "epoch": 7.260525273855087, + "grad_norm": 0.001539089367724955, + "learning_rate": 0.06657632618827995, + "loss": 0.0238, + "num_input_tokens_seen": 24883248, + "step": 27510 + }, + { + "epoch": 7.261845057410585, + "grad_norm": 0.0019556679762899876, + "learning_rate": 0.06652737816377623, + "loss": 0.0448, + "num_input_tokens_seen": 24887696, + "step": 27515 + }, + { + "epoch": 7.263164840966081, + "grad_norm": 0.001331416191533208, + "learning_rate": 0.06647844301179971, + "loss": 0.0489, + "num_input_tokens_seen": 24892272, + "step": 27520 + }, + { + "epoch": 7.264484624521579, + "grad_norm": 0.0020418984349817038, + "learning_rate": 0.06642952073989689, + "loss": 0.05, + "num_input_tokens_seen": 24896816, + "step": 27525 + }, + { + "epoch": 7.265804408077075, + "grad_norm": 0.00455870945006609, + "learning_rate": 0.06638061135561223, + "loss": 0.037, + "num_input_tokens_seen": 24901520, + "step": 27530 + }, + { + "epoch": 7.267124191632572, + "grad_norm": 0.003826643107458949, + "learning_rate": 0.06633171486648808, + "loss": 0.0427, + "num_input_tokens_seen": 24905872, + "step": 27535 + }, + { + "epoch": 7.268443975188069, + "grad_norm": 0.0021649145055562258, + "learning_rate": 0.06628283128006499, + "loss": 0.0285, + "num_input_tokens_seen": 24910864, + "step": 27540 + }, + { + "epoch": 7.269763758743566, + "grad_norm": 0.0024526731576770544, + "learning_rate": 0.0662339606038813, + "loss": 0.0298, + "num_input_tokens_seen": 24915504, + "step": 27545 + }, + { + "epoch": 7.271083542299063, + "grad_norm": 0.002510253805667162, + "learning_rate": 0.06618510284547358, + "loss": 0.0544, + "num_input_tokens_seen": 24920240, + "step": 27550 + }, + { + "epoch": 7.27240332585456, + "grad_norm": 0.0020113878417760134, + "learning_rate": 0.06613625801237633, + "loss": 0.0285, + "num_input_tokens_seen": 24924592, + "step": 27555 + }, + { + "epoch": 7.2737231094100565, + "grad_norm": 0.004013075027614832, + "learning_rate": 0.066087426112122, + "loss": 0.0353, + "num_input_tokens_seen": 24928912, + "step": 27560 + }, + { + "epoch": 7.275042892965554, + "grad_norm": 0.0030513282399624586, + "learning_rate": 0.06603860715224101, + "loss": 0.0367, + "num_input_tokens_seen": 24933392, + "step": 27565 + }, + { + "epoch": 7.2763626765210505, + "grad_norm": 0.0003003497258760035, + "learning_rate": 0.06598980114026198, + "loss": 0.0272, + "num_input_tokens_seen": 24938064, + "step": 27570 + }, + { + "epoch": 7.277682460076547, + "grad_norm": 0.004301684908568859, + "learning_rate": 0.06594100808371128, + "loss": 0.0295, + "num_input_tokens_seen": 24942640, + "step": 27575 + }, + { + "epoch": 7.2790022436320445, + "grad_norm": 0.000827981682959944, + "learning_rate": 0.06589222799011357, + "loss": 0.0253, + "num_input_tokens_seen": 24947248, + "step": 27580 + }, + { + "epoch": 7.280322027187541, + "grad_norm": 0.005288782529532909, + "learning_rate": 0.0658434608669912, + "loss": 0.0775, + "num_input_tokens_seen": 24951792, + "step": 27585 + }, + { + "epoch": 7.2816418107430385, + "grad_norm": 0.001938143977895379, + "learning_rate": 0.06579470672186473, + "loss": 0.0489, + "num_input_tokens_seen": 24956208, + "step": 27590 + }, + { + "epoch": 7.282961594298535, + "grad_norm": 0.004684374667704105, + "learning_rate": 0.06574596556225275, + "loss": 0.0389, + "num_input_tokens_seen": 24960656, + "step": 27595 + }, + { + "epoch": 7.284281377854032, + "grad_norm": 0.0006131718400865793, + "learning_rate": 0.06569723739567161, + "loss": 0.0196, + "num_input_tokens_seen": 24965104, + "step": 27600 + }, + { + "epoch": 7.284281377854032, + "eval_loss": 0.09250389039516449, + "eval_runtime": 75.8016, + "eval_samples_per_second": 88.85, + "eval_steps_per_second": 22.216, + "num_input_tokens_seen": 24965104, + "step": 27600 + }, + { + "epoch": 7.285601161409529, + "grad_norm": 0.0016354175750166178, + "learning_rate": 0.06564852222963588, + "loss": 0.0495, + "num_input_tokens_seen": 24969744, + "step": 27605 + }, + { + "epoch": 7.286920944965026, + "grad_norm": 0.0003921812167391181, + "learning_rate": 0.06559982007165813, + "loss": 0.0188, + "num_input_tokens_seen": 24974160, + "step": 27610 + }, + { + "epoch": 7.288240728520522, + "grad_norm": 0.0036472538486123085, + "learning_rate": 0.06555113092924868, + "loss": 0.0251, + "num_input_tokens_seen": 24978672, + "step": 27615 + }, + { + "epoch": 7.28956051207602, + "grad_norm": 0.003246599342674017, + "learning_rate": 0.06550245480991615, + "loss": 0.0493, + "num_input_tokens_seen": 24982832, + "step": 27620 + }, + { + "epoch": 7.290880295631516, + "grad_norm": 0.0019748478662222624, + "learning_rate": 0.0654537917211669, + "loss": 0.0118, + "num_input_tokens_seen": 24986992, + "step": 27625 + }, + { + "epoch": 7.292200079187014, + "grad_norm": 0.0033290234860032797, + "learning_rate": 0.0654051416705055, + "loss": 0.034, + "num_input_tokens_seen": 24991472, + "step": 27630 + }, + { + "epoch": 7.29351986274251, + "grad_norm": 0.00427862536162138, + "learning_rate": 0.06535650466543427, + "loss": 0.0608, + "num_input_tokens_seen": 24996048, + "step": 27635 + }, + { + "epoch": 7.294839646298007, + "grad_norm": 0.003364290576428175, + "learning_rate": 0.0653078807134538, + "loss": 0.0281, + "num_input_tokens_seen": 25000624, + "step": 27640 + }, + { + "epoch": 7.296159429853504, + "grad_norm": 0.00036391697358340025, + "learning_rate": 0.06525926982206236, + "loss": 0.0505, + "num_input_tokens_seen": 25005200, + "step": 27645 + }, + { + "epoch": 7.297479213409001, + "grad_norm": 0.004208758007735014, + "learning_rate": 0.06521067199875648, + "loss": 0.0407, + "num_input_tokens_seen": 25009744, + "step": 27650 + }, + { + "epoch": 7.298798996964498, + "grad_norm": 9.87176172202453e-05, + "learning_rate": 0.06516208725103047, + "loss": 0.0507, + "num_input_tokens_seen": 25014352, + "step": 27655 + }, + { + "epoch": 7.300118780519995, + "grad_norm": 0.0009080751333385706, + "learning_rate": 0.06511351558637678, + "loss": 0.0251, + "num_input_tokens_seen": 25018544, + "step": 27660 + }, + { + "epoch": 7.301438564075491, + "grad_norm": 0.006807367317378521, + "learning_rate": 0.06506495701228569, + "loss": 0.0522, + "num_input_tokens_seen": 25022896, + "step": 27665 + }, + { + "epoch": 7.302758347630989, + "grad_norm": 0.0034854228142648935, + "learning_rate": 0.06501641153624559, + "loss": 0.0408, + "num_input_tokens_seen": 25027568, + "step": 27670 + }, + { + "epoch": 7.304078131186485, + "grad_norm": 0.0018937175627797842, + "learning_rate": 0.06496787916574286, + "loss": 0.0322, + "num_input_tokens_seen": 25031920, + "step": 27675 + }, + { + "epoch": 7.305397914741983, + "grad_norm": 0.001834789989516139, + "learning_rate": 0.06491935990826168, + "loss": 0.0312, + "num_input_tokens_seen": 25036624, + "step": 27680 + }, + { + "epoch": 7.306717698297479, + "grad_norm": 0.0033539156429469585, + "learning_rate": 0.0648708537712844, + "loss": 0.0408, + "num_input_tokens_seen": 25041360, + "step": 27685 + }, + { + "epoch": 7.308037481852976, + "grad_norm": 0.0005473645287565887, + "learning_rate": 0.06482236076229132, + "loss": 0.0294, + "num_input_tokens_seen": 25046256, + "step": 27690 + }, + { + "epoch": 7.309357265408473, + "grad_norm": 0.0023809594567865133, + "learning_rate": 0.06477388088876056, + "loss": 0.0353, + "num_input_tokens_seen": 25050800, + "step": 27695 + }, + { + "epoch": 7.31067704896397, + "grad_norm": 0.0023326724767684937, + "learning_rate": 0.06472541415816846, + "loss": 0.033, + "num_input_tokens_seen": 25055312, + "step": 27700 + }, + { + "epoch": 7.311996832519466, + "grad_norm": 0.0003371182538103312, + "learning_rate": 0.06467696057798909, + "loss": 0.0174, + "num_input_tokens_seen": 25059728, + "step": 27705 + }, + { + "epoch": 7.313316616074964, + "grad_norm": 0.00128548766952008, + "learning_rate": 0.0646285201556946, + "loss": 0.0102, + "num_input_tokens_seen": 25064368, + "step": 27710 + }, + { + "epoch": 7.31463639963046, + "grad_norm": 0.001683040289208293, + "learning_rate": 0.06458009289875521, + "loss": 0.0381, + "num_input_tokens_seen": 25068752, + "step": 27715 + }, + { + "epoch": 7.315956183185958, + "grad_norm": 0.0016750666545704007, + "learning_rate": 0.0645316788146389, + "loss": 0.044, + "num_input_tokens_seen": 25073200, + "step": 27720 + }, + { + "epoch": 7.317275966741454, + "grad_norm": 0.0035895006731152534, + "learning_rate": 0.06448327791081175, + "loss": 0.0405, + "num_input_tokens_seen": 25077776, + "step": 27725 + }, + { + "epoch": 7.318595750296951, + "grad_norm": 0.0025683599524199963, + "learning_rate": 0.0644348901947379, + "loss": 0.0395, + "num_input_tokens_seen": 25082224, + "step": 27730 + }, + { + "epoch": 7.319915533852448, + "grad_norm": 0.004981770645827055, + "learning_rate": 0.06438651567387917, + "loss": 0.027, + "num_input_tokens_seen": 25086864, + "step": 27735 + }, + { + "epoch": 7.321235317407945, + "grad_norm": 0.0012014104286208749, + "learning_rate": 0.0643381543556957, + "loss": 0.046, + "num_input_tokens_seen": 25091376, + "step": 27740 + }, + { + "epoch": 7.3225551009634415, + "grad_norm": 0.0010060188360512257, + "learning_rate": 0.06428980624764526, + "loss": 0.0305, + "num_input_tokens_seen": 25096176, + "step": 27745 + }, + { + "epoch": 7.323874884518939, + "grad_norm": 0.0008190117659978569, + "learning_rate": 0.06424147135718378, + "loss": 0.022, + "num_input_tokens_seen": 25100944, + "step": 27750 + }, + { + "epoch": 7.3251946680744355, + "grad_norm": 0.0021059904247522354, + "learning_rate": 0.06419314969176519, + "loss": 0.0373, + "num_input_tokens_seen": 25105520, + "step": 27755 + }, + { + "epoch": 7.326514451629933, + "grad_norm": 0.00256975251249969, + "learning_rate": 0.06414484125884118, + "loss": 0.0177, + "num_input_tokens_seen": 25110032, + "step": 27760 + }, + { + "epoch": 7.3278342351854295, + "grad_norm": 0.0032320749014616013, + "learning_rate": 0.06409654606586157, + "loss": 0.0311, + "num_input_tokens_seen": 25114672, + "step": 27765 + }, + { + "epoch": 7.329154018740926, + "grad_norm": 0.0007130192825570703, + "learning_rate": 0.06404826412027415, + "loss": 0.0319, + "num_input_tokens_seen": 25119344, + "step": 27770 + }, + { + "epoch": 7.3304738022964235, + "grad_norm": 0.0025610458105802536, + "learning_rate": 0.06399999542952453, + "loss": 0.0502, + "num_input_tokens_seen": 25123920, + "step": 27775 + }, + { + "epoch": 7.33179358585192, + "grad_norm": 0.0018608925165608525, + "learning_rate": 0.0639517400010563, + "loss": 0.0201, + "num_input_tokens_seen": 25128304, + "step": 27780 + }, + { + "epoch": 7.3331133694074175, + "grad_norm": 0.0018695378676056862, + "learning_rate": 0.06390349784231118, + "loss": 0.056, + "num_input_tokens_seen": 25132880, + "step": 27785 + }, + { + "epoch": 7.334433152962914, + "grad_norm": 0.0019606593996286392, + "learning_rate": 0.06385526896072859, + "loss": 0.0514, + "num_input_tokens_seen": 25137168, + "step": 27790 + }, + { + "epoch": 7.335752936518411, + "grad_norm": 0.0027101454325020313, + "learning_rate": 0.06380705336374613, + "loss": 0.0538, + "num_input_tokens_seen": 25141776, + "step": 27795 + }, + { + "epoch": 7.337072720073908, + "grad_norm": 0.005597172770649195, + "learning_rate": 0.06375885105879918, + "loss": 0.0322, + "num_input_tokens_seen": 25146416, + "step": 27800 + }, + { + "epoch": 7.337072720073908, + "eval_loss": 0.09312741458415985, + "eval_runtime": 75.8494, + "eval_samples_per_second": 88.794, + "eval_steps_per_second": 22.202, + "num_input_tokens_seen": 25146416, + "step": 27800 + }, + { + "epoch": 7.338392503629405, + "grad_norm": 0.000390716886613518, + "learning_rate": 0.06371066205332115, + "loss": 0.0412, + "num_input_tokens_seen": 25150704, + "step": 27805 + }, + { + "epoch": 7.339712287184902, + "grad_norm": 0.0005871201865375042, + "learning_rate": 0.06366248635474347, + "loss": 0.0318, + "num_input_tokens_seen": 25155056, + "step": 27810 + }, + { + "epoch": 7.341032070740399, + "grad_norm": 0.004504958167672157, + "learning_rate": 0.06361432397049532, + "loss": 0.0636, + "num_input_tokens_seen": 25159280, + "step": 27815 + }, + { + "epoch": 7.342351854295895, + "grad_norm": 0.0030060403514653444, + "learning_rate": 0.06356617490800408, + "loss": 0.0509, + "num_input_tokens_seen": 25164048, + "step": 27820 + }, + { + "epoch": 7.343671637851393, + "grad_norm": 0.004578581545501947, + "learning_rate": 0.06351803917469478, + "loss": 0.078, + "num_input_tokens_seen": 25168624, + "step": 27825 + }, + { + "epoch": 7.344991421406889, + "grad_norm": 0.0016718897968530655, + "learning_rate": 0.06346991677799067, + "loss": 0.0446, + "num_input_tokens_seen": 25173168, + "step": 27830 + }, + { + "epoch": 7.346311204962386, + "grad_norm": 0.0025976665783673525, + "learning_rate": 0.06342180772531283, + "loss": 0.0354, + "num_input_tokens_seen": 25177296, + "step": 27835 + }, + { + "epoch": 7.347630988517883, + "grad_norm": 0.0005011005559936166, + "learning_rate": 0.06337371202408021, + "loss": 0.0207, + "num_input_tokens_seen": 25182096, + "step": 27840 + }, + { + "epoch": 7.34895077207338, + "grad_norm": 0.003351494437083602, + "learning_rate": 0.06332562968170984, + "loss": 0.0324, + "num_input_tokens_seen": 25186800, + "step": 27845 + }, + { + "epoch": 7.350270555628877, + "grad_norm": 5.545180829358287e-05, + "learning_rate": 0.06327756070561656, + "loss": 0.0238, + "num_input_tokens_seen": 25191216, + "step": 27850 + }, + { + "epoch": 7.351590339184374, + "grad_norm": 0.0007587678846903145, + "learning_rate": 0.06322950510321329, + "loss": 0.0307, + "num_input_tokens_seen": 25195696, + "step": 27855 + }, + { + "epoch": 7.35291012273987, + "grad_norm": 0.0003540876496117562, + "learning_rate": 0.06318146288191076, + "loss": 0.0286, + "num_input_tokens_seen": 25199984, + "step": 27860 + }, + { + "epoch": 7.354229906295368, + "grad_norm": 0.00039925158489495516, + "learning_rate": 0.06313343404911763, + "loss": 0.0103, + "num_input_tokens_seen": 25204592, + "step": 27865 + }, + { + "epoch": 7.355549689850864, + "grad_norm": 0.0006822266732342541, + "learning_rate": 0.0630854186122406, + "loss": 0.0318, + "num_input_tokens_seen": 25209104, + "step": 27870 + }, + { + "epoch": 7.356869473406362, + "grad_norm": 0.0004650565970223397, + "learning_rate": 0.06303741657868431, + "loss": 0.0283, + "num_input_tokens_seen": 25213712, + "step": 27875 + }, + { + "epoch": 7.358189256961858, + "grad_norm": 0.002741435309872031, + "learning_rate": 0.06298942795585115, + "loss": 0.0249, + "num_input_tokens_seen": 25218352, + "step": 27880 + }, + { + "epoch": 7.359509040517355, + "grad_norm": 0.0026813179720193148, + "learning_rate": 0.06294145275114167, + "loss": 0.0265, + "num_input_tokens_seen": 25222960, + "step": 27885 + }, + { + "epoch": 7.360828824072852, + "grad_norm": 0.0007340034353546798, + "learning_rate": 0.06289349097195428, + "loss": 0.0482, + "num_input_tokens_seen": 25227440, + "step": 27890 + }, + { + "epoch": 7.362148607628349, + "grad_norm": 0.002557744737714529, + "learning_rate": 0.06284554262568516, + "loss": 0.0249, + "num_input_tokens_seen": 25232240, + "step": 27895 + }, + { + "epoch": 7.363468391183845, + "grad_norm": 0.000833411468192935, + "learning_rate": 0.06279760771972868, + "loss": 0.023, + "num_input_tokens_seen": 25236624, + "step": 27900 + }, + { + "epoch": 7.364788174739343, + "grad_norm": 0.0032354500144720078, + "learning_rate": 0.06274968626147688, + "loss": 0.0324, + "num_input_tokens_seen": 25241200, + "step": 27905 + }, + { + "epoch": 7.366107958294839, + "grad_norm": 0.003991740755736828, + "learning_rate": 0.06270177825831993, + "loss": 0.0472, + "num_input_tokens_seen": 25246064, + "step": 27910 + }, + { + "epoch": 7.367427741850337, + "grad_norm": 0.0011527601163834333, + "learning_rate": 0.06265388371764587, + "loss": 0.0259, + "num_input_tokens_seen": 25250800, + "step": 27915 + }, + { + "epoch": 7.368747525405833, + "grad_norm": 0.0010636388324201107, + "learning_rate": 0.0626060026468406, + "loss": 0.0128, + "num_input_tokens_seen": 25255312, + "step": 27920 + }, + { + "epoch": 7.37006730896133, + "grad_norm": 0.003147654701024294, + "learning_rate": 0.06255813505328794, + "loss": 0.028, + "num_input_tokens_seen": 25259664, + "step": 27925 + }, + { + "epoch": 7.371387092516827, + "grad_norm": 0.00459991954267025, + "learning_rate": 0.06251028094436978, + "loss": 0.0683, + "num_input_tokens_seen": 25264112, + "step": 27930 + }, + { + "epoch": 7.372706876072324, + "grad_norm": 0.0037815687246620655, + "learning_rate": 0.06246244032746568, + "loss": 0.0505, + "num_input_tokens_seen": 25268624, + "step": 27935 + }, + { + "epoch": 7.3740266596278214, + "grad_norm": 0.0021871428471058607, + "learning_rate": 0.06241461320995342, + "loss": 0.0193, + "num_input_tokens_seen": 25272944, + "step": 27940 + }, + { + "epoch": 7.375346443183318, + "grad_norm": 0.004633280914276838, + "learning_rate": 0.062366799599208426, + "loss": 0.0423, + "num_input_tokens_seen": 25277520, + "step": 27945 + }, + { + "epoch": 7.376666226738815, + "grad_norm": 0.0007193829515017569, + "learning_rate": 0.06231899950260418, + "loss": 0.0408, + "num_input_tokens_seen": 25282320, + "step": 27950 + }, + { + "epoch": 7.377986010294312, + "grad_norm": 5.026541475672275e-05, + "learning_rate": 0.06227121292751214, + "loss": 0.0571, + "num_input_tokens_seen": 25286768, + "step": 27955 + }, + { + "epoch": 7.379305793849809, + "grad_norm": 0.003180421655997634, + "learning_rate": 0.062223439881301496, + "loss": 0.0381, + "num_input_tokens_seen": 25291280, + "step": 27960 + }, + { + "epoch": 7.380625577405305, + "grad_norm": 0.0005414718762040138, + "learning_rate": 0.06217568037133948, + "loss": 0.0195, + "num_input_tokens_seen": 25296048, + "step": 27965 + }, + { + "epoch": 7.381945360960803, + "grad_norm": 0.001702082809060812, + "learning_rate": 0.06212793440499126, + "loss": 0.0246, + "num_input_tokens_seen": 25300432, + "step": 27970 + }, + { + "epoch": 7.383265144516299, + "grad_norm": 0.0046034445986151695, + "learning_rate": 0.062080201989619783, + "loss": 0.0193, + "num_input_tokens_seen": 25304720, + "step": 27975 + }, + { + "epoch": 7.384584928071797, + "grad_norm": 0.0021398644894361496, + "learning_rate": 0.062032483132586094, + "loss": 0.0253, + "num_input_tokens_seen": 25309296, + "step": 27980 + }, + { + "epoch": 7.385904711627293, + "grad_norm": 0.005807923153042793, + "learning_rate": 0.0619847778412489, + "loss": 0.0318, + "num_input_tokens_seen": 25313904, + "step": 27985 + }, + { + "epoch": 7.38722449518279, + "grad_norm": 0.0009135191794484854, + "learning_rate": 0.06193708612296509, + "loss": 0.0135, + "num_input_tokens_seen": 25318416, + "step": 27990 + }, + { + "epoch": 7.388544278738287, + "grad_norm": 0.0032332304399460554, + "learning_rate": 0.06188940798508923, + "loss": 0.0309, + "num_input_tokens_seen": 25323056, + "step": 27995 + }, + { + "epoch": 7.389864062293784, + "grad_norm": 0.00030858220998197794, + "learning_rate": 0.06184174343497397, + "loss": 0.0148, + "num_input_tokens_seen": 25327344, + "step": 28000 + }, + { + "epoch": 7.389864062293784, + "eval_loss": 0.09372107684612274, + "eval_runtime": 75.9247, + "eval_samples_per_second": 88.706, + "eval_steps_per_second": 22.18, + "num_input_tokens_seen": 25327344, + "step": 28000 + }, + { + "epoch": 7.391183845849281, + "grad_norm": 0.0032226156909018755, + "learning_rate": 0.061794092479969726, + "loss": 0.0699, + "num_input_tokens_seen": 25331696, + "step": 28005 + }, + { + "epoch": 7.392503629404778, + "grad_norm": 0.0027057749684900045, + "learning_rate": 0.06174645512742485, + "loss": 0.0647, + "num_input_tokens_seen": 25336304, + "step": 28010 + }, + { + "epoch": 7.393823412960274, + "grad_norm": 0.0034144108649343252, + "learning_rate": 0.06169883138468565, + "loss": 0.0513, + "num_input_tokens_seen": 25340784, + "step": 28015 + }, + { + "epoch": 7.395143196515772, + "grad_norm": 0.0009814281947910786, + "learning_rate": 0.06165122125909637, + "loss": 0.0355, + "num_input_tokens_seen": 25345488, + "step": 28020 + }, + { + "epoch": 7.396462980071268, + "grad_norm": 0.00015488546341657639, + "learning_rate": 0.061603624757998965, + "loss": 0.0254, + "num_input_tokens_seen": 25349872, + "step": 28025 + }, + { + "epoch": 7.397782763626765, + "grad_norm": 0.0056979157961905, + "learning_rate": 0.0615560418887335, + "loss": 0.1082, + "num_input_tokens_seen": 25354224, + "step": 28030 + }, + { + "epoch": 7.399102547182262, + "grad_norm": 0.0009665096295066178, + "learning_rate": 0.06150847265863787, + "loss": 0.0607, + "num_input_tokens_seen": 25358672, + "step": 28035 + }, + { + "epoch": 7.400422330737759, + "grad_norm": 0.00041628428152762353, + "learning_rate": 0.061460917075047757, + "loss": 0.0175, + "num_input_tokens_seen": 25363504, + "step": 28040 + }, + { + "epoch": 7.401742114293256, + "grad_norm": 0.002013427671045065, + "learning_rate": 0.06141337514529694, + "loss": 0.0151, + "num_input_tokens_seen": 25368016, + "step": 28045 + }, + { + "epoch": 7.403061897848753, + "grad_norm": 0.002598450519144535, + "learning_rate": 0.06136584687671687, + "loss": 0.0169, + "num_input_tokens_seen": 25372240, + "step": 28050 + }, + { + "epoch": 7.404381681404249, + "grad_norm": 0.0008741025230847299, + "learning_rate": 0.061318332276637064, + "loss": 0.0375, + "num_input_tokens_seen": 25376880, + "step": 28055 + }, + { + "epoch": 7.405701464959747, + "grad_norm": 0.000245287868892774, + "learning_rate": 0.06127083135238491, + "loss": 0.0266, + "num_input_tokens_seen": 25381392, + "step": 28060 + }, + { + "epoch": 7.407021248515243, + "grad_norm": 0.004424704238772392, + "learning_rate": 0.06122334411128555, + "loss": 0.0664, + "num_input_tokens_seen": 25385808, + "step": 28065 + }, + { + "epoch": 7.408341032070741, + "grad_norm": 0.0033068160992115736, + "learning_rate": 0.06117587056066223, + "loss": 0.0392, + "num_input_tokens_seen": 25389872, + "step": 28070 + }, + { + "epoch": 7.409660815626237, + "grad_norm": 0.004436929244548082, + "learning_rate": 0.06112841070783589, + "loss": 0.0159, + "num_input_tokens_seen": 25394192, + "step": 28075 + }, + { + "epoch": 7.410980599181734, + "grad_norm": 0.0020025556441396475, + "learning_rate": 0.061080964560125406, + "loss": 0.0447, + "num_input_tokens_seen": 25398608, + "step": 28080 + }, + { + "epoch": 7.412300382737231, + "grad_norm": 0.003971402999013662, + "learning_rate": 0.06103353212484766, + "loss": 0.0245, + "num_input_tokens_seen": 25402512, + "step": 28085 + }, + { + "epoch": 7.413620166292728, + "grad_norm": 0.001973762409761548, + "learning_rate": 0.06098611340931722, + "loss": 0.0263, + "num_input_tokens_seen": 25407056, + "step": 28090 + }, + { + "epoch": 7.414939949848225, + "grad_norm": 0.0022084282245486975, + "learning_rate": 0.06093870842084672, + "loss": 0.055, + "num_input_tokens_seen": 25411600, + "step": 28095 + }, + { + "epoch": 7.416259733403722, + "grad_norm": 0.0031093910802155733, + "learning_rate": 0.06089131716674666, + "loss": 0.0422, + "num_input_tokens_seen": 25416240, + "step": 28100 + }, + { + "epoch": 7.4175795169592185, + "grad_norm": 0.0009211789583787322, + "learning_rate": 0.060843939654325226, + "loss": 0.0319, + "num_input_tokens_seen": 25420848, + "step": 28105 + }, + { + "epoch": 7.418899300514716, + "grad_norm": 0.0005288800457492471, + "learning_rate": 0.06079657589088873, + "loss": 0.0314, + "num_input_tokens_seen": 25425488, + "step": 28110 + }, + { + "epoch": 7.4202190840702125, + "grad_norm": 0.0019438507733866572, + "learning_rate": 0.06074922588374126, + "loss": 0.1001, + "num_input_tokens_seen": 25429840, + "step": 28115 + }, + { + "epoch": 7.421538867625709, + "grad_norm": 0.0009008599445223808, + "learning_rate": 0.06070188964018472, + "loss": 0.0162, + "num_input_tokens_seen": 25434352, + "step": 28120 + }, + { + "epoch": 7.4228586511812065, + "grad_norm": 0.0017964455764740705, + "learning_rate": 0.06065456716751902, + "loss": 0.0336, + "num_input_tokens_seen": 25438800, + "step": 28125 + }, + { + "epoch": 7.424178434736703, + "grad_norm": 0.000805094197858125, + "learning_rate": 0.06060725847304182, + "loss": 0.0203, + "num_input_tokens_seen": 25443056, + "step": 28130 + }, + { + "epoch": 7.4254982182922005, + "grad_norm": 0.0025008711963891983, + "learning_rate": 0.06055996356404877, + "loss": 0.0373, + "num_input_tokens_seen": 25447344, + "step": 28135 + }, + { + "epoch": 7.426818001847697, + "grad_norm": 0.0011321678757667542, + "learning_rate": 0.06051268244783327, + "loss": 0.0503, + "num_input_tokens_seen": 25452208, + "step": 28140 + }, + { + "epoch": 7.428137785403194, + "grad_norm": 0.0013603982515633106, + "learning_rate": 0.06046541513168676, + "loss": 0.0398, + "num_input_tokens_seen": 25456944, + "step": 28145 + }, + { + "epoch": 7.429457568958691, + "grad_norm": 0.0016950148856267333, + "learning_rate": 0.060418161622898356, + "loss": 0.0304, + "num_input_tokens_seen": 25461680, + "step": 28150 + }, + { + "epoch": 7.430777352514188, + "grad_norm": 0.0027011376805603504, + "learning_rate": 0.06037092192875521, + "loss": 0.0294, + "num_input_tokens_seen": 25466288, + "step": 28155 + }, + { + "epoch": 7.432097136069684, + "grad_norm": 0.0027573092374950647, + "learning_rate": 0.060323696056542225, + "loss": 0.0415, + "num_input_tokens_seen": 25470960, + "step": 28160 + }, + { + "epoch": 7.433416919625182, + "grad_norm": 0.003673934144899249, + "learning_rate": 0.06027648401354229, + "loss": 0.029, + "num_input_tokens_seen": 25475472, + "step": 28165 + }, + { + "epoch": 7.434736703180678, + "grad_norm": 0.0014565922319889069, + "learning_rate": 0.06022928580703601, + "loss": 0.0675, + "num_input_tokens_seen": 25480208, + "step": 28170 + }, + { + "epoch": 7.436056486736176, + "grad_norm": 0.0010864316718652844, + "learning_rate": 0.060182101444301986, + "loss": 0.0209, + "num_input_tokens_seen": 25484592, + "step": 28175 + }, + { + "epoch": 7.437376270291672, + "grad_norm": 0.003631638130173087, + "learning_rate": 0.06013493093261669, + "loss": 0.0387, + "num_input_tokens_seen": 25489328, + "step": 28180 + }, + { + "epoch": 7.438696053847169, + "grad_norm": 0.003344567259773612, + "learning_rate": 0.06008777427925432, + "loss": 0.0395, + "num_input_tokens_seen": 25494000, + "step": 28185 + }, + { + "epoch": 7.440015837402666, + "grad_norm": 0.003695171792060137, + "learning_rate": 0.06004063149148705, + "loss": 0.0569, + "num_input_tokens_seen": 25498672, + "step": 28190 + }, + { + "epoch": 7.441335620958163, + "grad_norm": 0.00013615188072435558, + "learning_rate": 0.05999350257658497, + "loss": 0.0311, + "num_input_tokens_seen": 25503088, + "step": 28195 + }, + { + "epoch": 7.44265540451366, + "grad_norm": 0.0017449302831664681, + "learning_rate": 0.05994638754181582, + "loss": 0.0333, + "num_input_tokens_seen": 25507504, + "step": 28200 + }, + { + "epoch": 7.44265540451366, + "eval_loss": 0.09080032259225845, + "eval_runtime": 75.8841, + "eval_samples_per_second": 88.754, + "eval_steps_per_second": 22.192, + "num_input_tokens_seen": 25507504, + "step": 28200 + }, + { + "epoch": 7.443975188069157, + "grad_norm": 0.002951968926936388, + "learning_rate": 0.059899286394445445, + "loss": 0.0288, + "num_input_tokens_seen": 25511984, + "step": 28205 + }, + { + "epoch": 7.445294971624653, + "grad_norm": 0.0028305980376899242, + "learning_rate": 0.059852199141737346, + "loss": 0.0552, + "num_input_tokens_seen": 25517008, + "step": 28210 + }, + { + "epoch": 7.446614755180151, + "grad_norm": 0.0012510441010817885, + "learning_rate": 0.05980512579095304, + "loss": 0.0184, + "num_input_tokens_seen": 25521392, + "step": 28215 + }, + { + "epoch": 7.447934538735647, + "grad_norm": 0.0015291066374629736, + "learning_rate": 0.05975806634935181, + "loss": 0.0218, + "num_input_tokens_seen": 25526064, + "step": 28220 + }, + { + "epoch": 7.449254322291145, + "grad_norm": 0.002600622596219182, + "learning_rate": 0.05971102082419076, + "loss": 0.0342, + "num_input_tokens_seen": 25530448, + "step": 28225 + }, + { + "epoch": 7.450574105846641, + "grad_norm": 0.0017699471209198236, + "learning_rate": 0.05966398922272492, + "loss": 0.0371, + "num_input_tokens_seen": 25535216, + "step": 28230 + }, + { + "epoch": 7.451893889402138, + "grad_norm": 0.0048828162252902985, + "learning_rate": 0.059616971552207236, + "loss": 0.0338, + "num_input_tokens_seen": 25539824, + "step": 28235 + }, + { + "epoch": 7.453213672957635, + "grad_norm": 0.003864880185574293, + "learning_rate": 0.059569967819888305, + "loss": 0.0675, + "num_input_tokens_seen": 25544336, + "step": 28240 + }, + { + "epoch": 7.454533456513132, + "grad_norm": 0.0007373479893431067, + "learning_rate": 0.05952297803301681, + "loss": 0.057, + "num_input_tokens_seen": 25549008, + "step": 28245 + }, + { + "epoch": 7.455853240068628, + "grad_norm": 0.0030818767845630646, + "learning_rate": 0.059476002198839056, + "loss": 0.0473, + "num_input_tokens_seen": 25553552, + "step": 28250 + }, + { + "epoch": 7.457173023624126, + "grad_norm": 0.002647917717695236, + "learning_rate": 0.05942904032459935, + "loss": 0.0154, + "num_input_tokens_seen": 25557904, + "step": 28255 + }, + { + "epoch": 7.458492807179622, + "grad_norm": 0.0002988573396578431, + "learning_rate": 0.05938209241753987, + "loss": 0.0417, + "num_input_tokens_seen": 25562032, + "step": 28260 + }, + { + "epoch": 7.45981259073512, + "grad_norm": 0.0012823635479435325, + "learning_rate": 0.05933515848490046, + "loss": 0.0243, + "num_input_tokens_seen": 25566448, + "step": 28265 + }, + { + "epoch": 7.461132374290616, + "grad_norm": 0.002306265290826559, + "learning_rate": 0.059288238533918985, + "loss": 0.0321, + "num_input_tokens_seen": 25570992, + "step": 28270 + }, + { + "epoch": 7.462452157846113, + "grad_norm": 0.0014116051606833935, + "learning_rate": 0.05924133257183113, + "loss": 0.027, + "num_input_tokens_seen": 25575696, + "step": 28275 + }, + { + "epoch": 7.46377194140161, + "grad_norm": 0.002051963470876217, + "learning_rate": 0.059194440605870285, + "loss": 0.0352, + "num_input_tokens_seen": 25580208, + "step": 28280 + }, + { + "epoch": 7.465091724957107, + "grad_norm": 0.0018120892345905304, + "learning_rate": 0.059147562643267884, + "loss": 0.0101, + "num_input_tokens_seen": 25584496, + "step": 28285 + }, + { + "epoch": 7.4664115085126035, + "grad_norm": 0.0014129480114206672, + "learning_rate": 0.059100698691253055, + "loss": 0.0482, + "num_input_tokens_seen": 25588944, + "step": 28290 + }, + { + "epoch": 7.467731292068101, + "grad_norm": 0.0034159766510128975, + "learning_rate": 0.05905384875705273, + "loss": 0.0455, + "num_input_tokens_seen": 25593328, + "step": 28295 + }, + { + "epoch": 7.4690510756235975, + "grad_norm": 0.005476234946399927, + "learning_rate": 0.05900701284789189, + "loss": 0.0526, + "num_input_tokens_seen": 25597712, + "step": 28300 + }, + { + "epoch": 7.470370859179095, + "grad_norm": 0.001200484693981707, + "learning_rate": 0.058960190970993115, + "loss": 0.0167, + "num_input_tokens_seen": 25602352, + "step": 28305 + }, + { + "epoch": 7.4716906427345915, + "grad_norm": 0.0006209686398506165, + "learning_rate": 0.058913383133576955, + "loss": 0.0445, + "num_input_tokens_seen": 25606864, + "step": 28310 + }, + { + "epoch": 7.473010426290088, + "grad_norm": 0.0018233224982395768, + "learning_rate": 0.05886658934286185, + "loss": 0.0402, + "num_input_tokens_seen": 25611216, + "step": 28315 + }, + { + "epoch": 7.4743302098455855, + "grad_norm": 0.0010100190993398428, + "learning_rate": 0.058819809606063846, + "loss": 0.0375, + "num_input_tokens_seen": 25615728, + "step": 28320 + }, + { + "epoch": 7.475649993401082, + "grad_norm": 0.004279529210180044, + "learning_rate": 0.05877304393039711, + "loss": 0.0373, + "num_input_tokens_seen": 25620272, + "step": 28325 + }, + { + "epoch": 7.4769697769565795, + "grad_norm": 0.0029111464973539114, + "learning_rate": 0.05872629232307338, + "loss": 0.0385, + "num_input_tokens_seen": 25624880, + "step": 28330 + }, + { + "epoch": 7.478289560512076, + "grad_norm": 0.002246694639325142, + "learning_rate": 0.05867955479130239, + "loss": 0.046, + "num_input_tokens_seen": 25629424, + "step": 28335 + }, + { + "epoch": 7.479609344067573, + "grad_norm": 0.0008119007688947022, + "learning_rate": 0.058632831342291705, + "loss": 0.0559, + "num_input_tokens_seen": 25633840, + "step": 28340 + }, + { + "epoch": 7.48092912762307, + "grad_norm": 0.0037875091657042503, + "learning_rate": 0.05858612198324655, + "loss": 0.0357, + "num_input_tokens_seen": 25638288, + "step": 28345 + }, + { + "epoch": 7.482248911178567, + "grad_norm": 0.0030684410594403744, + "learning_rate": 0.05853942672137025, + "loss": 0.0445, + "num_input_tokens_seen": 25642672, + "step": 28350 + }, + { + "epoch": 7.483568694734064, + "grad_norm": 0.0029132484924048185, + "learning_rate": 0.05849274556386363, + "loss": 0.0796, + "num_input_tokens_seen": 25647376, + "step": 28355 + }, + { + "epoch": 7.484888478289561, + "grad_norm": 0.006166463252156973, + "learning_rate": 0.05844607851792567, + "loss": 0.0539, + "num_input_tokens_seen": 25652016, + "step": 28360 + }, + { + "epoch": 7.486208261845057, + "grad_norm": 0.001567178056575358, + "learning_rate": 0.058399425590752924, + "loss": 0.0176, + "num_input_tokens_seen": 25656464, + "step": 28365 + }, + { + "epoch": 7.487528045400555, + "grad_norm": 0.0020177781116217375, + "learning_rate": 0.05835278678953985, + "loss": 0.022, + "num_input_tokens_seen": 25660976, + "step": 28370 + }, + { + "epoch": 7.488847828956051, + "grad_norm": 0.0006417311378754675, + "learning_rate": 0.05830616212147874, + "loss": 0.0201, + "num_input_tokens_seen": 25665712, + "step": 28375 + }, + { + "epoch": 7.490167612511548, + "grad_norm": 0.000608511792961508, + "learning_rate": 0.058259551593759784, + "loss": 0.032, + "num_input_tokens_seen": 25670064, + "step": 28380 + }, + { + "epoch": 7.491487396067045, + "grad_norm": 0.0015310110757127404, + "learning_rate": 0.058212955213570804, + "loss": 0.0519, + "num_input_tokens_seen": 25674864, + "step": 28385 + }, + { + "epoch": 7.492807179622542, + "grad_norm": 0.0010592961916700006, + "learning_rate": 0.0581663729880976, + "loss": 0.0336, + "num_input_tokens_seen": 25679568, + "step": 28390 + }, + { + "epoch": 7.494126963178039, + "grad_norm": 0.0026545121800154448, + "learning_rate": 0.05811980492452379, + "loss": 0.0163, + "num_input_tokens_seen": 25683792, + "step": 28395 + }, + { + "epoch": 7.495446746733536, + "grad_norm": 0.002043864456936717, + "learning_rate": 0.058073251030030644, + "loss": 0.0388, + "num_input_tokens_seen": 25688464, + "step": 28400 + }, + { + "epoch": 7.495446746733536, + "eval_loss": 0.0940430536866188, + "eval_runtime": 75.8678, + "eval_samples_per_second": 88.773, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 25688464, + "step": 28400 + }, + { + "epoch": 7.496766530289032, + "grad_norm": 0.0010416604345664382, + "learning_rate": 0.05802671131179747, + "loss": 0.0333, + "num_input_tokens_seen": 25693200, + "step": 28405 + }, + { + "epoch": 7.49808631384453, + "grad_norm": 0.0005556776304729283, + "learning_rate": 0.057980185777001154, + "loss": 0.0607, + "num_input_tokens_seen": 25697808, + "step": 28410 + }, + { + "epoch": 7.499406097400026, + "grad_norm": 0.00223532528616488, + "learning_rate": 0.057933674432816606, + "loss": 0.0402, + "num_input_tokens_seen": 25702352, + "step": 28415 + }, + { + "epoch": 7.500725880955523, + "grad_norm": 0.001881006290204823, + "learning_rate": 0.05788717728641648, + "loss": 0.0385, + "num_input_tokens_seen": 25707088, + "step": 28420 + }, + { + "epoch": 7.50204566451102, + "grad_norm": 0.002121404279023409, + "learning_rate": 0.057840694344971126, + "loss": 0.028, + "num_input_tokens_seen": 25711856, + "step": 28425 + }, + { + "epoch": 7.503365448066517, + "grad_norm": 0.0035642622970044613, + "learning_rate": 0.0577942256156489, + "loss": 0.0639, + "num_input_tokens_seen": 25716528, + "step": 28430 + }, + { + "epoch": 7.504685231622014, + "grad_norm": 0.0015494102844968438, + "learning_rate": 0.057747771105615804, + "loss": 0.0337, + "num_input_tokens_seen": 25720912, + "step": 28435 + }, + { + "epoch": 7.506005015177511, + "grad_norm": 0.00176162738353014, + "learning_rate": 0.05770133082203568, + "loss": 0.0515, + "num_input_tokens_seen": 25725584, + "step": 28440 + }, + { + "epoch": 7.507324798733007, + "grad_norm": 0.005877459887415171, + "learning_rate": 0.0576549047720703, + "loss": 0.0395, + "num_input_tokens_seen": 25730256, + "step": 28445 + }, + { + "epoch": 7.508644582288505, + "grad_norm": 0.0010428220266476274, + "learning_rate": 0.05760849296287902, + "loss": 0.0315, + "num_input_tokens_seen": 25734928, + "step": 28450 + }, + { + "epoch": 7.509964365844001, + "grad_norm": 0.006170510780066252, + "learning_rate": 0.05756209540161919, + "loss": 0.0366, + "num_input_tokens_seen": 25739664, + "step": 28455 + }, + { + "epoch": 7.511284149399499, + "grad_norm": 0.0024148253723978996, + "learning_rate": 0.05751571209544595, + "loss": 0.0713, + "num_input_tokens_seen": 25744368, + "step": 28460 + }, + { + "epoch": 7.512603932954995, + "grad_norm": 0.0014920781832188368, + "learning_rate": 0.057469343051512085, + "loss": 0.0307, + "num_input_tokens_seen": 25749040, + "step": 28465 + }, + { + "epoch": 7.513923716510492, + "grad_norm": 0.001177690806798637, + "learning_rate": 0.057422988276968324, + "loss": 0.0284, + "num_input_tokens_seen": 25753552, + "step": 28470 + }, + { + "epoch": 7.515243500065989, + "grad_norm": 0.001468059839680791, + "learning_rate": 0.05737664777896323, + "loss": 0.0167, + "num_input_tokens_seen": 25757936, + "step": 28475 + }, + { + "epoch": 7.516563283621486, + "grad_norm": 0.00041610567132011056, + "learning_rate": 0.057330321564642975, + "loss": 0.0252, + "num_input_tokens_seen": 25762320, + "step": 28480 + }, + { + "epoch": 7.517883067176983, + "grad_norm": 0.0010810347739607096, + "learning_rate": 0.05728400964115174, + "loss": 0.0139, + "num_input_tokens_seen": 25767184, + "step": 28485 + }, + { + "epoch": 7.51920285073248, + "grad_norm": 0.0030060268472880125, + "learning_rate": 0.057237712015631305, + "loss": 0.0966, + "num_input_tokens_seen": 25771536, + "step": 28490 + }, + { + "epoch": 7.5205226342879765, + "grad_norm": 0.002286919392645359, + "learning_rate": 0.057191428695221425, + "loss": 0.024, + "num_input_tokens_seen": 25775888, + "step": 28495 + }, + { + "epoch": 7.521842417843474, + "grad_norm": 0.004542139358818531, + "learning_rate": 0.05714515968705958, + "loss": 0.0914, + "num_input_tokens_seen": 25780368, + "step": 28500 + }, + { + "epoch": 7.5231622013989705, + "grad_norm": 0.0005011843168176711, + "learning_rate": 0.05709890499828099, + "loss": 0.0257, + "num_input_tokens_seen": 25784880, + "step": 28505 + }, + { + "epoch": 7.524481984954468, + "grad_norm": 0.004789032973349094, + "learning_rate": 0.05705266463601868, + "loss": 0.0271, + "num_input_tokens_seen": 25789200, + "step": 28510 + }, + { + "epoch": 7.5258017685099645, + "grad_norm": 0.0007960696239024401, + "learning_rate": 0.057006438607403565, + "loss": 0.0212, + "num_input_tokens_seen": 25793488, + "step": 28515 + }, + { + "epoch": 7.527121552065461, + "grad_norm": 0.0021622038912028074, + "learning_rate": 0.056960226919564205, + "loss": 0.0354, + "num_input_tokens_seen": 25797936, + "step": 28520 + }, + { + "epoch": 7.5284413356209585, + "grad_norm": 0.0004957284545525908, + "learning_rate": 0.05691402957962713, + "loss": 0.0148, + "num_input_tokens_seen": 25802480, + "step": 28525 + }, + { + "epoch": 7.529761119176455, + "grad_norm": 0.0011741609778255224, + "learning_rate": 0.05686784659471642, + "loss": 0.036, + "num_input_tokens_seen": 25807184, + "step": 28530 + }, + { + "epoch": 7.531080902731952, + "grad_norm": 0.0035399547778069973, + "learning_rate": 0.056821677971954136, + "loss": 0.0288, + "num_input_tokens_seen": 25811472, + "step": 28535 + }, + { + "epoch": 7.532400686287449, + "grad_norm": 0.0003861708682961762, + "learning_rate": 0.05677552371846012, + "loss": 0.0463, + "num_input_tokens_seen": 25816272, + "step": 28540 + }, + { + "epoch": 7.533720469842946, + "grad_norm": 0.0027301248628646135, + "learning_rate": 0.05672938384135182, + "loss": 0.0225, + "num_input_tokens_seen": 25820720, + "step": 28545 + }, + { + "epoch": 7.535040253398442, + "grad_norm": 0.0023175731766968966, + "learning_rate": 0.05668325834774465, + "loss": 0.0156, + "num_input_tokens_seen": 25825104, + "step": 28550 + }, + { + "epoch": 7.53636003695394, + "grad_norm": 0.0010058473562821746, + "learning_rate": 0.05663714724475177, + "loss": 0.0537, + "num_input_tokens_seen": 25829648, + "step": 28555 + }, + { + "epoch": 7.537679820509436, + "grad_norm": 0.003590840846300125, + "learning_rate": 0.05659105053948403, + "loss": 0.0314, + "num_input_tokens_seen": 25834192, + "step": 28560 + }, + { + "epoch": 7.538999604064934, + "grad_norm": 0.00254211132414639, + "learning_rate": 0.056544968239050176, + "loss": 0.0474, + "num_input_tokens_seen": 25838608, + "step": 28565 + }, + { + "epoch": 7.54031938762043, + "grad_norm": 0.002190663944929838, + "learning_rate": 0.056498900350556616, + "loss": 0.0184, + "num_input_tokens_seen": 25843088, + "step": 28570 + }, + { + "epoch": 7.541639171175927, + "grad_norm": 0.0016079138731583953, + "learning_rate": 0.05645284688110766, + "loss": 0.0172, + "num_input_tokens_seen": 25847344, + "step": 28575 + }, + { + "epoch": 7.542958954731424, + "grad_norm": 0.004286430776119232, + "learning_rate": 0.05640680783780532, + "loss": 0.0333, + "num_input_tokens_seen": 25851920, + "step": 28580 + }, + { + "epoch": 7.544278738286921, + "grad_norm": 0.0041894917376339436, + "learning_rate": 0.056360783227749324, + "loss": 0.0306, + "num_input_tokens_seen": 25856464, + "step": 28585 + }, + { + "epoch": 7.545598521842418, + "grad_norm": 0.0026759086176753044, + "learning_rate": 0.05631477305803728, + "loss": 0.0637, + "num_input_tokens_seen": 25860752, + "step": 28590 + }, + { + "epoch": 7.546918305397915, + "grad_norm": 0.002334204502403736, + "learning_rate": 0.05626877733576462, + "loss": 0.0193, + "num_input_tokens_seen": 25865392, + "step": 28595 + }, + { + "epoch": 7.548238088953411, + "grad_norm": 0.0012701904634013772, + "learning_rate": 0.05622279606802435, + "loss": 0.0489, + "num_input_tokens_seen": 25870064, + "step": 28600 + }, + { + "epoch": 7.548238088953411, + "eval_loss": 0.09339945018291473, + "eval_runtime": 75.8262, + "eval_samples_per_second": 88.822, + "eval_steps_per_second": 22.209, + "num_input_tokens_seen": 25870064, + "step": 28600 + }, + { + "epoch": 7.549557872508909, + "grad_norm": 0.0018386519514024258, + "learning_rate": 0.05617682926190744, + "loss": 0.0329, + "num_input_tokens_seen": 25874640, + "step": 28605 + }, + { + "epoch": 7.550877656064405, + "grad_norm": 0.0037688324227929115, + "learning_rate": 0.05613087692450248, + "loss": 0.0237, + "num_input_tokens_seen": 25879024, + "step": 28610 + }, + { + "epoch": 7.552197439619903, + "grad_norm": 0.001583209610544145, + "learning_rate": 0.05608493906289592, + "loss": 0.0356, + "num_input_tokens_seen": 25883536, + "step": 28615 + }, + { + "epoch": 7.553517223175399, + "grad_norm": 0.0052640121430158615, + "learning_rate": 0.05603901568417201, + "loss": 0.0358, + "num_input_tokens_seen": 25888144, + "step": 28620 + }, + { + "epoch": 7.554837006730896, + "grad_norm": 0.004681549966335297, + "learning_rate": 0.055993106795412625, + "loss": 0.0438, + "num_input_tokens_seen": 25892880, + "step": 28625 + }, + { + "epoch": 7.556156790286393, + "grad_norm": 0.001707741292193532, + "learning_rate": 0.05594721240369759, + "loss": 0.0729, + "num_input_tokens_seen": 25897424, + "step": 28630 + }, + { + "epoch": 7.55747657384189, + "grad_norm": 0.002560376189649105, + "learning_rate": 0.055901332516104296, + "loss": 0.0446, + "num_input_tokens_seen": 25902160, + "step": 28635 + }, + { + "epoch": 7.558796357397387, + "grad_norm": 0.0020105966832488775, + "learning_rate": 0.05585546713970804, + "loss": 0.0816, + "num_input_tokens_seen": 25906480, + "step": 28640 + }, + { + "epoch": 7.560116140952884, + "grad_norm": 0.0020436663180589676, + "learning_rate": 0.05580961628158189, + "loss": 0.0229, + "num_input_tokens_seen": 25910960, + "step": 28645 + }, + { + "epoch": 7.56143592450838, + "grad_norm": 0.0009607744286768138, + "learning_rate": 0.05576377994879659, + "loss": 0.0298, + "num_input_tokens_seen": 25915504, + "step": 28650 + }, + { + "epoch": 7.562755708063878, + "grad_norm": 0.00019949005218222737, + "learning_rate": 0.05571795814842063, + "loss": 0.0307, + "num_input_tokens_seen": 25919984, + "step": 28655 + }, + { + "epoch": 7.564075491619374, + "grad_norm": 0.003017866052687168, + "learning_rate": 0.05567215088752037, + "loss": 0.0409, + "num_input_tokens_seen": 25924240, + "step": 28660 + }, + { + "epoch": 7.565395275174871, + "grad_norm": 0.0028868364170193672, + "learning_rate": 0.05562635817315981, + "loss": 0.0442, + "num_input_tokens_seen": 25928560, + "step": 28665 + }, + { + "epoch": 7.566715058730368, + "grad_norm": 0.0014039970701560378, + "learning_rate": 0.05558058001240083, + "loss": 0.0518, + "num_input_tokens_seen": 25933104, + "step": 28670 + }, + { + "epoch": 7.568034842285865, + "grad_norm": 0.0020108448807150126, + "learning_rate": 0.055534816412302915, + "loss": 0.0256, + "num_input_tokens_seen": 25937840, + "step": 28675 + }, + { + "epoch": 7.5693546258413615, + "grad_norm": 0.0017918497323989868, + "learning_rate": 0.055489067379923436, + "loss": 0.024, + "num_input_tokens_seen": 25942608, + "step": 28680 + }, + { + "epoch": 7.570674409396859, + "grad_norm": 0.0023928103037178516, + "learning_rate": 0.055443332922317505, + "loss": 0.0247, + "num_input_tokens_seen": 25947344, + "step": 28685 + }, + { + "epoch": 7.5719941929523555, + "grad_norm": 0.002953513525426388, + "learning_rate": 0.055397613046537876, + "loss": 0.0285, + "num_input_tokens_seen": 25951984, + "step": 28690 + }, + { + "epoch": 7.573313976507853, + "grad_norm": 0.0017222145106643438, + "learning_rate": 0.055351907759635145, + "loss": 0.0566, + "num_input_tokens_seen": 25956400, + "step": 28695 + }, + { + "epoch": 7.5746337600633495, + "grad_norm": 0.0012337570078670979, + "learning_rate": 0.05530621706865772, + "loss": 0.0575, + "num_input_tokens_seen": 25960720, + "step": 28700 + }, + { + "epoch": 7.575953543618846, + "grad_norm": 0.002428096253424883, + "learning_rate": 0.055260540980651564, + "loss": 0.0421, + "num_input_tokens_seen": 25964976, + "step": 28705 + }, + { + "epoch": 7.5772733271743435, + "grad_norm": 0.00037512672133743763, + "learning_rate": 0.05521487950266062, + "loss": 0.0494, + "num_input_tokens_seen": 25969360, + "step": 28710 + }, + { + "epoch": 7.57859311072984, + "grad_norm": 0.00047258875565603375, + "learning_rate": 0.055169232641726344, + "loss": 0.0361, + "num_input_tokens_seen": 25974192, + "step": 28715 + }, + { + "epoch": 7.5799128942853375, + "grad_norm": 0.005593184847384691, + "learning_rate": 0.055123600404888166, + "loss": 0.0549, + "num_input_tokens_seen": 25978928, + "step": 28720 + }, + { + "epoch": 7.581232677840834, + "grad_norm": 0.000875685247592628, + "learning_rate": 0.05507798279918309, + "loss": 0.0172, + "num_input_tokens_seen": 25983344, + "step": 28725 + }, + { + "epoch": 7.582552461396331, + "grad_norm": 0.0014325306983664632, + "learning_rate": 0.0550323798316459, + "loss": 0.0431, + "num_input_tokens_seen": 25988176, + "step": 28730 + }, + { + "epoch": 7.583872244951828, + "grad_norm": 0.005240359343588352, + "learning_rate": 0.05498679150930916, + "loss": 0.0791, + "num_input_tokens_seen": 25992752, + "step": 28735 + }, + { + "epoch": 7.585192028507325, + "grad_norm": 0.0007540370570495725, + "learning_rate": 0.05494121783920323, + "loss": 0.0199, + "num_input_tokens_seen": 25997104, + "step": 28740 + }, + { + "epoch": 7.586511812062822, + "grad_norm": 0.0008250616956502199, + "learning_rate": 0.05489565882835605, + "loss": 0.0314, + "num_input_tokens_seen": 26001840, + "step": 28745 + }, + { + "epoch": 7.587831595618319, + "grad_norm": 0.0010179831879213452, + "learning_rate": 0.05485011448379348, + "loss": 0.047, + "num_input_tokens_seen": 26006736, + "step": 28750 + }, + { + "epoch": 7.589151379173815, + "grad_norm": 0.0014411706943064928, + "learning_rate": 0.05480458481253893, + "loss": 0.0349, + "num_input_tokens_seen": 26011056, + "step": 28755 + }, + { + "epoch": 7.590471162729313, + "grad_norm": 0.000645918189547956, + "learning_rate": 0.054759069821613715, + "loss": 0.0301, + "num_input_tokens_seen": 26015568, + "step": 28760 + }, + { + "epoch": 7.591790946284809, + "grad_norm": 0.004234005231410265, + "learning_rate": 0.05471356951803683, + "loss": 0.0663, + "num_input_tokens_seen": 26020432, + "step": 28765 + }, + { + "epoch": 7.593110729840307, + "grad_norm": 0.001120059983804822, + "learning_rate": 0.054668083908824945, + "loss": 0.0402, + "num_input_tokens_seen": 26025264, + "step": 28770 + }, + { + "epoch": 7.594430513395803, + "grad_norm": 0.0007190403412096202, + "learning_rate": 0.054622613000992526, + "loss": 0.0313, + "num_input_tokens_seen": 26029520, + "step": 28775 + }, + { + "epoch": 7.5957502969513, + "grad_norm": 0.0008198755094781518, + "learning_rate": 0.05457715680155182, + "loss": 0.0321, + "num_input_tokens_seen": 26033968, + "step": 28780 + }, + { + "epoch": 7.597070080506797, + "grad_norm": 0.0014692882541567087, + "learning_rate": 0.05453171531751265, + "loss": 0.0223, + "num_input_tokens_seen": 26038352, + "step": 28785 + }, + { + "epoch": 7.598389864062294, + "grad_norm": 0.0009297979413531721, + "learning_rate": 0.05448628855588276, + "loss": 0.0187, + "num_input_tokens_seen": 26042736, + "step": 28790 + }, + { + "epoch": 7.59970964761779, + "grad_norm": 0.0025990952271968126, + "learning_rate": 0.05444087652366746, + "loss": 0.0374, + "num_input_tokens_seen": 26047184, + "step": 28795 + }, + { + "epoch": 7.601029431173288, + "grad_norm": 0.0011947562452405691, + "learning_rate": 0.05439547922786984, + "loss": 0.0296, + "num_input_tokens_seen": 26051856, + "step": 28800 + }, + { + "epoch": 7.601029431173288, + "eval_loss": 0.09339552372694016, + "eval_runtime": 75.9275, + "eval_samples_per_second": 88.703, + "eval_steps_per_second": 22.179, + "num_input_tokens_seen": 26051856, + "step": 28800 + }, + { + "epoch": 7.602349214728784, + "grad_norm": 0.00033560782321728766, + "learning_rate": 0.0543500966754908, + "loss": 0.0342, + "num_input_tokens_seen": 26056368, + "step": 28805 + }, + { + "epoch": 7.603668998284281, + "grad_norm": 0.0019349167123436928, + "learning_rate": 0.05430472887352882, + "loss": 0.0534, + "num_input_tokens_seen": 26060848, + "step": 28810 + }, + { + "epoch": 7.604988781839778, + "grad_norm": 0.004137837328016758, + "learning_rate": 0.05425937582898023, + "loss": 0.0381, + "num_input_tokens_seen": 26065520, + "step": 28815 + }, + { + "epoch": 7.606308565395275, + "grad_norm": 0.0019856246653944254, + "learning_rate": 0.054214037548839085, + "loss": 0.0545, + "num_input_tokens_seen": 26069648, + "step": 28820 + }, + { + "epoch": 7.607628348950772, + "grad_norm": 0.003697293810546398, + "learning_rate": 0.05416871404009703, + "loss": 0.0733, + "num_input_tokens_seen": 26074320, + "step": 28825 + }, + { + "epoch": 7.608948132506269, + "grad_norm": 0.002302306005731225, + "learning_rate": 0.054123405309743605, + "loss": 0.0254, + "num_input_tokens_seen": 26078928, + "step": 28830 + }, + { + "epoch": 7.6102679160617654, + "grad_norm": 0.0010295659303665161, + "learning_rate": 0.0540781113647659, + "loss": 0.0262, + "num_input_tokens_seen": 26083536, + "step": 28835 + }, + { + "epoch": 7.611587699617263, + "grad_norm": 0.0018148390809074044, + "learning_rate": 0.054032832212148836, + "loss": 0.0233, + "num_input_tokens_seen": 26088112, + "step": 28840 + }, + { + "epoch": 7.6129074831727594, + "grad_norm": 0.003283816622570157, + "learning_rate": 0.0539875678588751, + "loss": 0.0442, + "num_input_tokens_seen": 26092336, + "step": 28845 + }, + { + "epoch": 7.614227266728257, + "grad_norm": 0.0025312937796115875, + "learning_rate": 0.05394231831192492, + "loss": 0.0338, + "num_input_tokens_seen": 26097072, + "step": 28850 + }, + { + "epoch": 7.6155470502837534, + "grad_norm": 0.0018360092071816325, + "learning_rate": 0.05389708357827639, + "loss": 0.0214, + "num_input_tokens_seen": 26101840, + "step": 28855 + }, + { + "epoch": 7.61686683383925, + "grad_norm": 0.0026999053079634905, + "learning_rate": 0.05385186366490533, + "loss": 0.0325, + "num_input_tokens_seen": 26106224, + "step": 28860 + }, + { + "epoch": 7.6181866173947475, + "grad_norm": 0.0011985193705186248, + "learning_rate": 0.053806658578785166, + "loss": 0.0507, + "num_input_tokens_seen": 26110416, + "step": 28865 + }, + { + "epoch": 7.619506400950244, + "grad_norm": 0.0010581972310319543, + "learning_rate": 0.05376146832688705, + "loss": 0.0279, + "num_input_tokens_seen": 26114800, + "step": 28870 + }, + { + "epoch": 7.6208261845057415, + "grad_norm": 0.0030686899553984404, + "learning_rate": 0.053716292916179964, + "loss": 0.0288, + "num_input_tokens_seen": 26119024, + "step": 28875 + }, + { + "epoch": 7.622145968061238, + "grad_norm": 0.0010520971845835447, + "learning_rate": 0.05367113235363045, + "loss": 0.0449, + "num_input_tokens_seen": 26123600, + "step": 28880 + }, + { + "epoch": 7.623465751616735, + "grad_norm": 0.0021181758493185043, + "learning_rate": 0.05362598664620289, + "loss": 0.028, + "num_input_tokens_seen": 26127760, + "step": 28885 + }, + { + "epoch": 7.624785535172232, + "grad_norm": 0.0017668022774159908, + "learning_rate": 0.053580855800859285, + "loss": 0.0394, + "num_input_tokens_seen": 26131952, + "step": 28890 + }, + { + "epoch": 7.626105318727729, + "grad_norm": 0.0017461462412029505, + "learning_rate": 0.05353573982455938, + "loss": 0.0315, + "num_input_tokens_seen": 26136336, + "step": 28895 + }, + { + "epoch": 7.627425102283226, + "grad_norm": 0.0013945099199190736, + "learning_rate": 0.053490638724260686, + "loss": 0.022, + "num_input_tokens_seen": 26140816, + "step": 28900 + }, + { + "epoch": 7.628744885838723, + "grad_norm": 0.003795139491558075, + "learning_rate": 0.05344555250691827, + "loss": 0.1016, + "num_input_tokens_seen": 26145488, + "step": 28905 + }, + { + "epoch": 7.630064669394219, + "grad_norm": 0.0035267581697553396, + "learning_rate": 0.053400481179485086, + "loss": 0.0273, + "num_input_tokens_seen": 26149904, + "step": 28910 + }, + { + "epoch": 7.631384452949717, + "grad_norm": 0.0024818910751491785, + "learning_rate": 0.05335542474891159, + "loss": 0.0296, + "num_input_tokens_seen": 26154544, + "step": 28915 + }, + { + "epoch": 7.632704236505213, + "grad_norm": 0.002047381130978465, + "learning_rate": 0.053310383222146124, + "loss": 0.0245, + "num_input_tokens_seen": 26158928, + "step": 28920 + }, + { + "epoch": 7.63402402006071, + "grad_norm": 0.002215816406533122, + "learning_rate": 0.053265356606134684, + "loss": 0.0265, + "num_input_tokens_seen": 26163632, + "step": 28925 + }, + { + "epoch": 7.635343803616207, + "grad_norm": 0.00011570378410397097, + "learning_rate": 0.053220344907820856, + "loss": 0.026, + "num_input_tokens_seen": 26168144, + "step": 28930 + }, + { + "epoch": 7.636663587171704, + "grad_norm": 0.0032809649128466845, + "learning_rate": 0.05317534813414608, + "loss": 0.0484, + "num_input_tokens_seen": 26172432, + "step": 28935 + }, + { + "epoch": 7.637983370727201, + "grad_norm": 0.0006485410849563777, + "learning_rate": 0.05313036629204942, + "loss": 0.0291, + "num_input_tokens_seen": 26176752, + "step": 28940 + }, + { + "epoch": 7.639303154282698, + "grad_norm": 0.005022912286221981, + "learning_rate": 0.05308539938846756, + "loss": 0.0626, + "num_input_tokens_seen": 26181488, + "step": 28945 + }, + { + "epoch": 7.640622937838194, + "grad_norm": 0.00045953699736855924, + "learning_rate": 0.05304044743033507, + "loss": 0.0641, + "num_input_tokens_seen": 26186160, + "step": 28950 + }, + { + "epoch": 7.641942721393692, + "grad_norm": 0.002485898556187749, + "learning_rate": 0.05299551042458401, + "loss": 0.015, + "num_input_tokens_seen": 26190800, + "step": 28955 + }, + { + "epoch": 7.643262504949188, + "grad_norm": 0.002570903627201915, + "learning_rate": 0.052950588378144266, + "loss": 0.0128, + "num_input_tokens_seen": 26195312, + "step": 28960 + }, + { + "epoch": 7.644582288504685, + "grad_norm": 0.0011432411847636104, + "learning_rate": 0.052905681297943465, + "loss": 0.0263, + "num_input_tokens_seen": 26199856, + "step": 28965 + }, + { + "epoch": 7.645902072060182, + "grad_norm": 0.002702459692955017, + "learning_rate": 0.0528607891909067, + "loss": 0.0206, + "num_input_tokens_seen": 26204528, + "step": 28970 + }, + { + "epoch": 7.647221855615679, + "grad_norm": 0.0033759099896997213, + "learning_rate": 0.05281591206395697, + "loss": 0.0248, + "num_input_tokens_seen": 26208720, + "step": 28975 + }, + { + "epoch": 7.648541639171176, + "grad_norm": 0.0031534377485513687, + "learning_rate": 0.05277104992401496, + "loss": 0.0433, + "num_input_tokens_seen": 26213200, + "step": 28980 + }, + { + "epoch": 7.649861422726673, + "grad_norm": 0.005515271332114935, + "learning_rate": 0.05272620277799884, + "loss": 0.0471, + "num_input_tokens_seen": 26218128, + "step": 28985 + }, + { + "epoch": 7.651181206282169, + "grad_norm": 0.004917629994452, + "learning_rate": 0.05268137063282473, + "loss": 0.0348, + "num_input_tokens_seen": 26222768, + "step": 28990 + }, + { + "epoch": 7.652500989837667, + "grad_norm": 0.00060180330183357, + "learning_rate": 0.0526365534954062, + "loss": 0.016, + "num_input_tokens_seen": 26227536, + "step": 28995 + }, + { + "epoch": 7.653820773393163, + "grad_norm": 0.0010634344071149826, + "learning_rate": 0.052591751372654656, + "loss": 0.0221, + "num_input_tokens_seen": 26232080, + "step": 29000 + }, + { + "epoch": 7.653820773393163, + "eval_loss": 0.09137992560863495, + "eval_runtime": 75.8204, + "eval_samples_per_second": 88.828, + "eval_steps_per_second": 22.21, + "num_input_tokens_seen": 26232080, + "step": 29000 + }, + { + "epoch": 7.655140556948661, + "grad_norm": 0.00410751486197114, + "learning_rate": 0.05254696427147921, + "loss": 0.0391, + "num_input_tokens_seen": 26236624, + "step": 29005 + }, + { + "epoch": 7.656460340504157, + "grad_norm": 0.0030971658416092396, + "learning_rate": 0.052502192198786546, + "loss": 0.0507, + "num_input_tokens_seen": 26241200, + "step": 29010 + }, + { + "epoch": 7.657780124059654, + "grad_norm": 0.002393806353211403, + "learning_rate": 0.05245743516148103, + "loss": 0.0649, + "num_input_tokens_seen": 26245712, + "step": 29015 + }, + { + "epoch": 7.659099907615151, + "grad_norm": 0.0022828122600913048, + "learning_rate": 0.05241269316646486, + "loss": 0.0735, + "num_input_tokens_seen": 26249968, + "step": 29020 + }, + { + "epoch": 7.660419691170648, + "grad_norm": 0.004122125916182995, + "learning_rate": 0.052367966220637725, + "loss": 0.0256, + "num_input_tokens_seen": 26254480, + "step": 29025 + }, + { + "epoch": 7.661739474726145, + "grad_norm": 0.003666143398731947, + "learning_rate": 0.05232325433089716, + "loss": 0.0335, + "num_input_tokens_seen": 26259024, + "step": 29030 + }, + { + "epoch": 7.663059258281642, + "grad_norm": 0.0008871416794136167, + "learning_rate": 0.052278557504138214, + "loss": 0.0458, + "num_input_tokens_seen": 26263408, + "step": 29035 + }, + { + "epoch": 7.6643790418371385, + "grad_norm": 0.0016295568784698844, + "learning_rate": 0.05223387574725372, + "loss": 0.0192, + "num_input_tokens_seen": 26267792, + "step": 29040 + }, + { + "epoch": 7.665698825392636, + "grad_norm": 0.00027731049340218306, + "learning_rate": 0.05218920906713428, + "loss": 0.0264, + "num_input_tokens_seen": 26272560, + "step": 29045 + }, + { + "epoch": 7.6670186089481325, + "grad_norm": 0.0016860086470842361, + "learning_rate": 0.05214455747066789, + "loss": 0.0557, + "num_input_tokens_seen": 26277232, + "step": 29050 + }, + { + "epoch": 7.668338392503629, + "grad_norm": 0.0013871280243620276, + "learning_rate": 0.05209992096474048, + "loss": 0.0215, + "num_input_tokens_seen": 26281648, + "step": 29055 + }, + { + "epoch": 7.6696581760591265, + "grad_norm": 0.0030177650041878223, + "learning_rate": 0.05205529955623559, + "loss": 0.023, + "num_input_tokens_seen": 26286352, + "step": 29060 + }, + { + "epoch": 7.670977959614623, + "grad_norm": 0.0006622254149988294, + "learning_rate": 0.052010693252034314, + "loss": 0.0267, + "num_input_tokens_seen": 26290896, + "step": 29065 + }, + { + "epoch": 7.6722977431701205, + "grad_norm": 0.0020594371017068624, + "learning_rate": 0.0519661020590156, + "loss": 0.0293, + "num_input_tokens_seen": 26295632, + "step": 29070 + }, + { + "epoch": 7.673617526725617, + "grad_norm": 0.0014327401295304298, + "learning_rate": 0.05192152598405586, + "loss": 0.0145, + "num_input_tokens_seen": 26300176, + "step": 29075 + }, + { + "epoch": 7.674937310281114, + "grad_norm": 0.0006669293507002294, + "learning_rate": 0.05187696503402941, + "loss": 0.0208, + "num_input_tokens_seen": 26304592, + "step": 29080 + }, + { + "epoch": 7.676257093836611, + "grad_norm": 0.0012526216451078653, + "learning_rate": 0.05183241921580798, + "loss": 0.0412, + "num_input_tokens_seen": 26309232, + "step": 29085 + }, + { + "epoch": 7.677576877392108, + "grad_norm": 0.004471329040825367, + "learning_rate": 0.051787888536261206, + "loss": 0.0501, + "num_input_tokens_seen": 26314128, + "step": 29090 + }, + { + "epoch": 7.678896660947604, + "grad_norm": 0.0029950602911412716, + "learning_rate": 0.051743373002256184, + "loss": 0.051, + "num_input_tokens_seen": 26318672, + "step": 29095 + }, + { + "epoch": 7.680216444503102, + "grad_norm": 0.0038894496392458677, + "learning_rate": 0.05169887262065787, + "loss": 0.0603, + "num_input_tokens_seen": 26323216, + "step": 29100 + }, + { + "epoch": 7.681536228058598, + "grad_norm": 0.0022671720944344997, + "learning_rate": 0.051654387398328665, + "loss": 0.0558, + "num_input_tokens_seen": 26327728, + "step": 29105 + }, + { + "epoch": 7.682856011614096, + "grad_norm": 0.002958816010504961, + "learning_rate": 0.05160991734212888, + "loss": 0.0506, + "num_input_tokens_seen": 26332304, + "step": 29110 + }, + { + "epoch": 7.684175795169592, + "grad_norm": 0.00042168746585957706, + "learning_rate": 0.051565462458916224, + "loss": 0.0402, + "num_input_tokens_seen": 26337168, + "step": 29115 + }, + { + "epoch": 7.685495578725089, + "grad_norm": 0.006003579590469599, + "learning_rate": 0.05152102275554627, + "loss": 0.0245, + "num_input_tokens_seen": 26342032, + "step": 29120 + }, + { + "epoch": 7.686815362280586, + "grad_norm": 0.0029154166113585234, + "learning_rate": 0.05147659823887222, + "loss": 0.0493, + "num_input_tokens_seen": 26346512, + "step": 29125 + }, + { + "epoch": 7.688135145836083, + "grad_norm": 0.0063255587592720985, + "learning_rate": 0.05143218891574479, + "loss": 0.0522, + "num_input_tokens_seen": 26351152, + "step": 29130 + }, + { + "epoch": 7.68945492939158, + "grad_norm": 0.0014139515114948153, + "learning_rate": 0.0513877947930125, + "loss": 0.0451, + "num_input_tokens_seen": 26355824, + "step": 29135 + }, + { + "epoch": 7.690774712947077, + "grad_norm": 0.0005379660869948566, + "learning_rate": 0.051343415877521566, + "loss": 0.0218, + "num_input_tokens_seen": 26360752, + "step": 29140 + }, + { + "epoch": 7.692094496502573, + "grad_norm": 0.0034232004545629025, + "learning_rate": 0.051299052176115634, + "loss": 0.0501, + "num_input_tokens_seen": 26365264, + "step": 29145 + }, + { + "epoch": 7.693414280058071, + "grad_norm": 0.0009245671099051833, + "learning_rate": 0.051254703695636256, + "loss": 0.0424, + "num_input_tokens_seen": 26369456, + "step": 29150 + }, + { + "epoch": 7.694734063613567, + "grad_norm": 0.0016563677927479148, + "learning_rate": 0.05121037044292249, + "loss": 0.0652, + "num_input_tokens_seen": 26373968, + "step": 29155 + }, + { + "epoch": 7.696053847169065, + "grad_norm": 0.0020975701045244932, + "learning_rate": 0.05116605242481101, + "loss": 0.0498, + "num_input_tokens_seen": 26378320, + "step": 29160 + }, + { + "epoch": 7.697373630724561, + "grad_norm": 0.004019108600914478, + "learning_rate": 0.05112174964813634, + "loss": 0.0399, + "num_input_tokens_seen": 26382896, + "step": 29165 + }, + { + "epoch": 7.698693414280058, + "grad_norm": 0.001702916924841702, + "learning_rate": 0.05107746211973038, + "loss": 0.0236, + "num_input_tokens_seen": 26387536, + "step": 29170 + }, + { + "epoch": 7.700013197835555, + "grad_norm": 0.0041625178419053555, + "learning_rate": 0.05103318984642291, + "loss": 0.0712, + "num_input_tokens_seen": 26391984, + "step": 29175 + }, + { + "epoch": 7.701332981391052, + "grad_norm": 0.002369555877521634, + "learning_rate": 0.05098893283504131, + "loss": 0.0328, + "num_input_tokens_seen": 26396560, + "step": 29180 + }, + { + "epoch": 7.702652764946548, + "grad_norm": 0.005018937401473522, + "learning_rate": 0.050944691092410475, + "loss": 0.0819, + "num_input_tokens_seen": 26401136, + "step": 29185 + }, + { + "epoch": 7.703972548502046, + "grad_norm": 0.0018572731642052531, + "learning_rate": 0.05090046462535313, + "loss": 0.0267, + "num_input_tokens_seen": 26405808, + "step": 29190 + }, + { + "epoch": 7.705292332057542, + "grad_norm": 0.0019681185949593782, + "learning_rate": 0.050856253440689454, + "loss": 0.0365, + "num_input_tokens_seen": 26410544, + "step": 29195 + }, + { + "epoch": 7.70661211561304, + "grad_norm": 0.003984695300459862, + "learning_rate": 0.050812057545237405, + "loss": 0.037, + "num_input_tokens_seen": 26415344, + "step": 29200 + }, + { + "epoch": 7.70661211561304, + "eval_loss": 0.08802808076143265, + "eval_runtime": 75.844, + "eval_samples_per_second": 88.801, + "eval_steps_per_second": 22.203, + "num_input_tokens_seen": 26415344, + "step": 29200 + }, + { + "epoch": 7.707931899168536, + "grad_norm": 0.001957648666575551, + "learning_rate": 0.0507678769458126, + "loss": 0.0377, + "num_input_tokens_seen": 26419696, + "step": 29205 + }, + { + "epoch": 7.709251682724033, + "grad_norm": 0.0014160872669890523, + "learning_rate": 0.050723711649228155, + "loss": 0.0364, + "num_input_tokens_seen": 26424336, + "step": 29210 + }, + { + "epoch": 7.71057146627953, + "grad_norm": 0.0007324576727114618, + "learning_rate": 0.05067956166229496, + "loss": 0.0235, + "num_input_tokens_seen": 26428944, + "step": 29215 + }, + { + "epoch": 7.711891249835027, + "grad_norm": 0.0022924391087144613, + "learning_rate": 0.05063542699182155, + "loss": 0.0438, + "num_input_tokens_seen": 26433296, + "step": 29220 + }, + { + "epoch": 7.7132110333905235, + "grad_norm": 0.0015502950409427285, + "learning_rate": 0.050591307644613996, + "loss": 0.0534, + "num_input_tokens_seen": 26438064, + "step": 29225 + }, + { + "epoch": 7.714530816946021, + "grad_norm": 0.0012292221654206514, + "learning_rate": 0.05054720362747599, + "loss": 0.0268, + "num_input_tokens_seen": 26442448, + "step": 29230 + }, + { + "epoch": 7.7158506005015175, + "grad_norm": 9.186091483570635e-05, + "learning_rate": 0.050503114947209035, + "loss": 0.0253, + "num_input_tokens_seen": 26446768, + "step": 29235 + }, + { + "epoch": 7.717170384057015, + "grad_norm": 0.0038517946377396584, + "learning_rate": 0.05045904161061207, + "loss": 0.0225, + "num_input_tokens_seen": 26451312, + "step": 29240 + }, + { + "epoch": 7.7184901676125115, + "grad_norm": 0.0040933070704340935, + "learning_rate": 0.05041498362448185, + "loss": 0.0696, + "num_input_tokens_seen": 26456080, + "step": 29245 + }, + { + "epoch": 7.719809951168008, + "grad_norm": 0.0016147632850334048, + "learning_rate": 0.05037094099561256, + "loss": 0.0396, + "num_input_tokens_seen": 26460752, + "step": 29250 + }, + { + "epoch": 7.7211297347235055, + "grad_norm": 0.0011568940244615078, + "learning_rate": 0.05032691373079624, + "loss": 0.0169, + "num_input_tokens_seen": 26465584, + "step": 29255 + }, + { + "epoch": 7.722449518279002, + "grad_norm": 0.0005833404720760882, + "learning_rate": 0.05028290183682234, + "loss": 0.0414, + "num_input_tokens_seen": 26470256, + "step": 29260 + }, + { + "epoch": 7.7237693018344995, + "grad_norm": 0.005345397163182497, + "learning_rate": 0.050238905320478096, + "loss": 0.0662, + "num_input_tokens_seen": 26474992, + "step": 29265 + }, + { + "epoch": 7.725089085389996, + "grad_norm": 0.003768528811633587, + "learning_rate": 0.05019492418854838, + "loss": 0.0763, + "num_input_tokens_seen": 26479600, + "step": 29270 + }, + { + "epoch": 7.726408868945493, + "grad_norm": 0.00048666357179172337, + "learning_rate": 0.05015095844781554, + "loss": 0.043, + "num_input_tokens_seen": 26484304, + "step": 29275 + }, + { + "epoch": 7.72772865250099, + "grad_norm": 0.0010781529126688838, + "learning_rate": 0.05010700810505968, + "loss": 0.0493, + "num_input_tokens_seen": 26488592, + "step": 29280 + }, + { + "epoch": 7.729048436056487, + "grad_norm": 0.0033283315133303404, + "learning_rate": 0.05006307316705856, + "loss": 0.0371, + "num_input_tokens_seen": 26493232, + "step": 29285 + }, + { + "epoch": 7.730368219611984, + "grad_norm": 0.004160208627581596, + "learning_rate": 0.0500191536405874, + "loss": 0.0381, + "num_input_tokens_seen": 26497744, + "step": 29290 + }, + { + "epoch": 7.731688003167481, + "grad_norm": 0.002325113397091627, + "learning_rate": 0.04997524953241922, + "loss": 0.0314, + "num_input_tokens_seen": 26502352, + "step": 29295 + }, + { + "epoch": 7.733007786722977, + "grad_norm": 0.002469372469931841, + "learning_rate": 0.049931360849324556, + "loss": 0.0355, + "num_input_tokens_seen": 26506832, + "step": 29300 + }, + { + "epoch": 7.734327570278475, + "grad_norm": 0.00355321541428566, + "learning_rate": 0.04988748759807155, + "loss": 0.0411, + "num_input_tokens_seen": 26511568, + "step": 29305 + }, + { + "epoch": 7.735647353833971, + "grad_norm": 0.00032892345916479826, + "learning_rate": 0.0498436297854261, + "loss": 0.0388, + "num_input_tokens_seen": 26516208, + "step": 29310 + }, + { + "epoch": 7.736967137389469, + "grad_norm": 0.003410629229620099, + "learning_rate": 0.04979978741815152, + "loss": 0.0253, + "num_input_tokens_seen": 26520752, + "step": 29315 + }, + { + "epoch": 7.738286920944965, + "grad_norm": 0.0026981027331203222, + "learning_rate": 0.04975596050300891, + "loss": 0.0448, + "num_input_tokens_seen": 26524944, + "step": 29320 + }, + { + "epoch": 7.739606704500462, + "grad_norm": 0.0005172598757781088, + "learning_rate": 0.049712149046757005, + "loss": 0.0274, + "num_input_tokens_seen": 26529744, + "step": 29325 + }, + { + "epoch": 7.740926488055959, + "grad_norm": 0.001403618254698813, + "learning_rate": 0.04966835305615194, + "loss": 0.0311, + "num_input_tokens_seen": 26534256, + "step": 29330 + }, + { + "epoch": 7.742246271611456, + "grad_norm": 0.001097221509553492, + "learning_rate": 0.049624572537947755, + "loss": 0.0172, + "num_input_tokens_seen": 26538800, + "step": 29335 + }, + { + "epoch": 7.743566055166952, + "grad_norm": 0.000712823064532131, + "learning_rate": 0.04958080749889582, + "loss": 0.0107, + "num_input_tokens_seen": 26543216, + "step": 29340 + }, + { + "epoch": 7.74488583872245, + "grad_norm": 0.0024987186770886183, + "learning_rate": 0.049537057945745304, + "loss": 0.0454, + "num_input_tokens_seen": 26547824, + "step": 29345 + }, + { + "epoch": 7.746205622277946, + "grad_norm": 0.0001405519578838721, + "learning_rate": 0.049493323885243, + "loss": 0.0273, + "num_input_tokens_seen": 26552432, + "step": 29350 + }, + { + "epoch": 7.747525405833443, + "grad_norm": 0.0002725540834944695, + "learning_rate": 0.04944960532413318, + "loss": 0.0221, + "num_input_tokens_seen": 26556784, + "step": 29355 + }, + { + "epoch": 7.74884518938894, + "grad_norm": 0.0034262570552527905, + "learning_rate": 0.049405902269157774, + "loss": 0.0356, + "num_input_tokens_seen": 26561456, + "step": 29360 + }, + { + "epoch": 7.750164972944437, + "grad_norm": 0.0005358330090530217, + "learning_rate": 0.04936221472705646, + "loss": 0.0369, + "num_input_tokens_seen": 26565808, + "step": 29365 + }, + { + "epoch": 7.751484756499934, + "grad_norm": 0.00041891649016179144, + "learning_rate": 0.04931854270456632, + "loss": 0.0113, + "num_input_tokens_seen": 26570512, + "step": 29370 + }, + { + "epoch": 7.752804540055431, + "grad_norm": 0.0014687584480270743, + "learning_rate": 0.049274886208422075, + "loss": 0.0171, + "num_input_tokens_seen": 26575184, + "step": 29375 + }, + { + "epoch": 7.754124323610927, + "grad_norm": 0.0008749645203351974, + "learning_rate": 0.049231245245356235, + "loss": 0.0212, + "num_input_tokens_seen": 26579632, + "step": 29380 + }, + { + "epoch": 7.755444107166425, + "grad_norm": 0.0023547904565930367, + "learning_rate": 0.049187619822098655, + "loss": 0.0231, + "num_input_tokens_seen": 26584112, + "step": 29385 + }, + { + "epoch": 7.756763890721921, + "grad_norm": 0.0006649349234066904, + "learning_rate": 0.04914400994537705, + "loss": 0.024, + "num_input_tokens_seen": 26588528, + "step": 29390 + }, + { + "epoch": 7.758083674277419, + "grad_norm": 0.0018148665549233556, + "learning_rate": 0.049100415621916485, + "loss": 0.0351, + "num_input_tokens_seen": 26592976, + "step": 29395 + }, + { + "epoch": 7.759403457832915, + "grad_norm": 0.0031159305945038795, + "learning_rate": 0.04905683685843981, + "loss": 0.0199, + "num_input_tokens_seen": 26597616, + "step": 29400 + }, + { + "epoch": 7.759403457832915, + "eval_loss": 0.09340815991163254, + "eval_runtime": 75.9823, + "eval_samples_per_second": 88.639, + "eval_steps_per_second": 22.163, + "num_input_tokens_seen": 26597616, + "step": 29400 + }, + { + "epoch": 7.760723241388412, + "grad_norm": 0.0040558986365795135, + "learning_rate": 0.049013273661667495, + "loss": 0.0348, + "num_input_tokens_seen": 26602416, + "step": 29405 + }, + { + "epoch": 7.762043024943909, + "grad_norm": 0.002606996102258563, + "learning_rate": 0.048969726038317396, + "loss": 0.0856, + "num_input_tokens_seen": 26606864, + "step": 29410 + }, + { + "epoch": 7.763362808499406, + "grad_norm": 0.004243878647685051, + "learning_rate": 0.048926193995105206, + "loss": 0.047, + "num_input_tokens_seen": 26611376, + "step": 29415 + }, + { + "epoch": 7.764682592054903, + "grad_norm": 0.0024031379725784063, + "learning_rate": 0.048882677538744035, + "loss": 0.0258, + "num_input_tokens_seen": 26615888, + "step": 29420 + }, + { + "epoch": 7.7660023756104, + "grad_norm": 0.0005760182975791395, + "learning_rate": 0.048839176675944715, + "loss": 0.0151, + "num_input_tokens_seen": 26620464, + "step": 29425 + }, + { + "epoch": 7.7673221591658965, + "grad_norm": 0.0014238886069506407, + "learning_rate": 0.04879569141341566, + "loss": 0.052, + "num_input_tokens_seen": 26624944, + "step": 29430 + }, + { + "epoch": 7.768641942721394, + "grad_norm": 0.00691070593893528, + "learning_rate": 0.04875222175786274, + "loss": 0.0248, + "num_input_tokens_seen": 26629552, + "step": 29435 + }, + { + "epoch": 7.7699617262768905, + "grad_norm": 0.0013433505082502961, + "learning_rate": 0.04870876771598966, + "loss": 0.0277, + "num_input_tokens_seen": 26634320, + "step": 29440 + }, + { + "epoch": 7.771281509832388, + "grad_norm": 0.0009385973680764437, + "learning_rate": 0.04866532929449744, + "loss": 0.0412, + "num_input_tokens_seen": 26638800, + "step": 29445 + }, + { + "epoch": 7.7726012933878845, + "grad_norm": 0.0009823464788496494, + "learning_rate": 0.048621906500084945, + "loss": 0.0475, + "num_input_tokens_seen": 26643184, + "step": 29450 + }, + { + "epoch": 7.773921076943381, + "grad_norm": 0.0014615209074690938, + "learning_rate": 0.04857849933944845, + "loss": 0.0212, + "num_input_tokens_seen": 26647664, + "step": 29455 + }, + { + "epoch": 7.7752408604988785, + "grad_norm": 0.002743249759078026, + "learning_rate": 0.048535107819281866, + "loss": 0.0485, + "num_input_tokens_seen": 26652336, + "step": 29460 + }, + { + "epoch": 7.776560644054375, + "grad_norm": 0.0016890057595446706, + "learning_rate": 0.04849173194627675, + "loss": 0.0293, + "num_input_tokens_seen": 26656880, + "step": 29465 + }, + { + "epoch": 7.777880427609872, + "grad_norm": 0.0015489091165363789, + "learning_rate": 0.04844837172712223, + "loss": 0.0409, + "num_input_tokens_seen": 26661488, + "step": 29470 + }, + { + "epoch": 7.779200211165369, + "grad_norm": 0.0020744926296174526, + "learning_rate": 0.04840502716850494, + "loss": 0.0376, + "num_input_tokens_seen": 26666000, + "step": 29475 + }, + { + "epoch": 7.780519994720866, + "grad_norm": 0.0013451561098918319, + "learning_rate": 0.04836169827710916, + "loss": 0.0488, + "num_input_tokens_seen": 26670704, + "step": 29480 + }, + { + "epoch": 7.781839778276362, + "grad_norm": 0.0003046517667826265, + "learning_rate": 0.04831838505961684, + "loss": 0.0515, + "num_input_tokens_seen": 26675280, + "step": 29485 + }, + { + "epoch": 7.78315956183186, + "grad_norm": 0.0015079948352649808, + "learning_rate": 0.048275087522707295, + "loss": 0.0397, + "num_input_tokens_seen": 26679696, + "step": 29490 + }, + { + "epoch": 7.784479345387356, + "grad_norm": 0.0015226738760247827, + "learning_rate": 0.04823180567305766, + "loss": 0.0524, + "num_input_tokens_seen": 26684624, + "step": 29495 + }, + { + "epoch": 7.785799128942854, + "grad_norm": 0.0014221605379134417, + "learning_rate": 0.04818853951734244, + "loss": 0.0447, + "num_input_tokens_seen": 26689264, + "step": 29500 + }, + { + "epoch": 7.78711891249835, + "grad_norm": 0.0013156917411834002, + "learning_rate": 0.04814528906223387, + "loss": 0.0481, + "num_input_tokens_seen": 26693520, + "step": 29505 + }, + { + "epoch": 7.788438696053847, + "grad_norm": 0.0009936400456354022, + "learning_rate": 0.04810205431440177, + "loss": 0.0209, + "num_input_tokens_seen": 26698000, + "step": 29510 + }, + { + "epoch": 7.789758479609344, + "grad_norm": 0.004379719495773315, + "learning_rate": 0.04805883528051341, + "loss": 0.077, + "num_input_tokens_seen": 26702544, + "step": 29515 + }, + { + "epoch": 7.791078263164841, + "grad_norm": 0.004340080544352531, + "learning_rate": 0.048015631967233685, + "loss": 0.0417, + "num_input_tokens_seen": 26706992, + "step": 29520 + }, + { + "epoch": 7.792398046720338, + "grad_norm": 0.004755521658807993, + "learning_rate": 0.04797244438122517, + "loss": 0.0298, + "num_input_tokens_seen": 26711600, + "step": 29525 + }, + { + "epoch": 7.793717830275835, + "grad_norm": 0.0010646097362041473, + "learning_rate": 0.04792927252914784, + "loss": 0.0271, + "num_input_tokens_seen": 26716112, + "step": 29530 + }, + { + "epoch": 7.795037613831331, + "grad_norm": 0.0010636256774887443, + "learning_rate": 0.04788611641765944, + "loss": 0.0297, + "num_input_tokens_seen": 26720944, + "step": 29535 + }, + { + "epoch": 7.796357397386829, + "grad_norm": 0.000997768947854638, + "learning_rate": 0.04784297605341508, + "loss": 0.0346, + "num_input_tokens_seen": 26725808, + "step": 29540 + }, + { + "epoch": 7.797677180942325, + "grad_norm": 0.0040373713709414005, + "learning_rate": 0.04779985144306761, + "loss": 0.0503, + "num_input_tokens_seen": 26730352, + "step": 29545 + }, + { + "epoch": 7.798996964497823, + "grad_norm": 0.005021179094910622, + "learning_rate": 0.047756742593267405, + "loss": 0.0165, + "num_input_tokens_seen": 26735056, + "step": 29550 + }, + { + "epoch": 7.800316748053319, + "grad_norm": 7.187260052887723e-05, + "learning_rate": 0.047713649510662315, + "loss": 0.0233, + "num_input_tokens_seen": 26739696, + "step": 29555 + }, + { + "epoch": 7.801636531608816, + "grad_norm": 0.00506534893065691, + "learning_rate": 0.04767057220189789, + "loss": 0.0455, + "num_input_tokens_seen": 26744144, + "step": 29560 + }, + { + "epoch": 7.802956315164313, + "grad_norm": 0.0019900843035429716, + "learning_rate": 0.04762751067361722, + "loss": 0.0157, + "num_input_tokens_seen": 26748848, + "step": 29565 + }, + { + "epoch": 7.80427609871981, + "grad_norm": 0.0015375264920294285, + "learning_rate": 0.04758446493246086, + "loss": 0.0545, + "num_input_tokens_seen": 26753008, + "step": 29570 + }, + { + "epoch": 7.805595882275307, + "grad_norm": 0.0018587313825264573, + "learning_rate": 0.047541434985067084, + "loss": 0.0644, + "num_input_tokens_seen": 26757200, + "step": 29575 + }, + { + "epoch": 7.806915665830804, + "grad_norm": 0.003621147945523262, + "learning_rate": 0.047498420838071556, + "loss": 0.029, + "num_input_tokens_seen": 26761456, + "step": 29580 + }, + { + "epoch": 7.8082354493863, + "grad_norm": 0.0030867934692651033, + "learning_rate": 0.04745542249810772, + "loss": 0.0309, + "num_input_tokens_seen": 26765744, + "step": 29585 + }, + { + "epoch": 7.809555232941798, + "grad_norm": 0.001412521000020206, + "learning_rate": 0.047412439971806324, + "loss": 0.0327, + "num_input_tokens_seen": 26770480, + "step": 29590 + }, + { + "epoch": 7.810875016497294, + "grad_norm": 0.0011383064556866884, + "learning_rate": 0.04736947326579592, + "loss": 0.0463, + "num_input_tokens_seen": 26775120, + "step": 29595 + }, + { + "epoch": 7.812194800052791, + "grad_norm": 0.0013751451624557376, + "learning_rate": 0.04732652238670245, + "loss": 0.0197, + "num_input_tokens_seen": 26779344, + "step": 29600 + }, + { + "epoch": 7.812194800052791, + "eval_loss": 0.09160458296537399, + "eval_runtime": 75.7648, + "eval_samples_per_second": 88.894, + "eval_steps_per_second": 22.227, + "num_input_tokens_seen": 26779344, + "step": 29600 + }, + { + "epoch": 7.813514583608288, + "grad_norm": 0.0021089084912091494, + "learning_rate": 0.04728358734114952, + "loss": 0.0263, + "num_input_tokens_seen": 26783984, + "step": 29605 + }, + { + "epoch": 7.814834367163785, + "grad_norm": 0.0013179779052734375, + "learning_rate": 0.04724066813575821, + "loss": 0.0273, + "num_input_tokens_seen": 26788240, + "step": 29610 + }, + { + "epoch": 7.8161541507192815, + "grad_norm": 0.0032580264378339052, + "learning_rate": 0.04719776477714729, + "loss": 0.038, + "num_input_tokens_seen": 26792784, + "step": 29615 + }, + { + "epoch": 7.817473934274779, + "grad_norm": 0.0026784061919897795, + "learning_rate": 0.047154877271932856, + "loss": 0.0232, + "num_input_tokens_seen": 26797328, + "step": 29620 + }, + { + "epoch": 7.8187937178302755, + "grad_norm": 0.003676908789202571, + "learning_rate": 0.0471120056267288, + "loss": 0.0315, + "num_input_tokens_seen": 26801808, + "step": 29625 + }, + { + "epoch": 7.820113501385773, + "grad_norm": 0.0019569452852010727, + "learning_rate": 0.047069149848146495, + "loss": 0.0286, + "num_input_tokens_seen": 26806448, + "step": 29630 + }, + { + "epoch": 7.8214332849412695, + "grad_norm": 0.00014988431939855218, + "learning_rate": 0.04702630994279473, + "loss": 0.0308, + "num_input_tokens_seen": 26811216, + "step": 29635 + }, + { + "epoch": 7.822753068496766, + "grad_norm": 0.0072694034315645695, + "learning_rate": 0.046983485917280035, + "loss": 0.0368, + "num_input_tokens_seen": 26815472, + "step": 29640 + }, + { + "epoch": 7.8240728520522635, + "grad_norm": 0.0051079425029456615, + "learning_rate": 0.04694067777820644, + "loss": 0.0469, + "num_input_tokens_seen": 26820304, + "step": 29645 + }, + { + "epoch": 7.82539263560776, + "grad_norm": 0.004545792005956173, + "learning_rate": 0.046897885532175415, + "loss": 0.0434, + "num_input_tokens_seen": 26824496, + "step": 29650 + }, + { + "epoch": 7.8267124191632576, + "grad_norm": 0.0017107267631217837, + "learning_rate": 0.04685510918578613, + "loss": 0.0366, + "num_input_tokens_seen": 26828944, + "step": 29655 + }, + { + "epoch": 7.828032202718754, + "grad_norm": 0.0009755035280250013, + "learning_rate": 0.04681234874563519, + "loss": 0.0498, + "num_input_tokens_seen": 26833456, + "step": 29660 + }, + { + "epoch": 7.829351986274251, + "grad_norm": 0.0023002170491963625, + "learning_rate": 0.046769604218316836, + "loss": 0.0287, + "num_input_tokens_seen": 26837936, + "step": 29665 + }, + { + "epoch": 7.830671769829748, + "grad_norm": 0.0007540474180132151, + "learning_rate": 0.04672687561042279, + "loss": 0.0311, + "num_input_tokens_seen": 26842416, + "step": 29670 + }, + { + "epoch": 7.831991553385245, + "grad_norm": 0.0019664315041154623, + "learning_rate": 0.046684162928542286, + "loss": 0.0471, + "num_input_tokens_seen": 26847024, + "step": 29675 + }, + { + "epoch": 7.833311336940742, + "grad_norm": 0.0022987353149801493, + "learning_rate": 0.04664146617926222, + "loss": 0.0333, + "num_input_tokens_seen": 26851536, + "step": 29680 + }, + { + "epoch": 7.834631120496239, + "grad_norm": 0.001371324877254665, + "learning_rate": 0.046598785369167, + "loss": 0.0228, + "num_input_tokens_seen": 26855856, + "step": 29685 + }, + { + "epoch": 7.835950904051735, + "grad_norm": 0.0014950664481148124, + "learning_rate": 0.046556120504838434, + "loss": 0.0135, + "num_input_tokens_seen": 26860592, + "step": 29690 + }, + { + "epoch": 7.837270687607233, + "grad_norm": 0.00414616335183382, + "learning_rate": 0.04651347159285609, + "loss": 0.0301, + "num_input_tokens_seen": 26865008, + "step": 29695 + }, + { + "epoch": 7.838590471162729, + "grad_norm": 0.0021644870284944773, + "learning_rate": 0.04647083863979688, + "loss": 0.0412, + "num_input_tokens_seen": 26869392, + "step": 29700 + }, + { + "epoch": 7.839910254718227, + "grad_norm": 0.0024922701995819807, + "learning_rate": 0.04642822165223538, + "loss": 0.0132, + "num_input_tokens_seen": 26873776, + "step": 29705 + }, + { + "epoch": 7.841230038273723, + "grad_norm": 0.0004336656420491636, + "learning_rate": 0.046385620636743716, + "loss": 0.019, + "num_input_tokens_seen": 26878448, + "step": 29710 + }, + { + "epoch": 7.84254982182922, + "grad_norm": 0.00412560161203146, + "learning_rate": 0.04634303559989141, + "loss": 0.0517, + "num_input_tokens_seen": 26882992, + "step": 29715 + }, + { + "epoch": 7.843869605384717, + "grad_norm": 0.0022918032482266426, + "learning_rate": 0.046300466548245635, + "loss": 0.0363, + "num_input_tokens_seen": 26887888, + "step": 29720 + }, + { + "epoch": 7.845189388940214, + "grad_norm": 0.0032141993287950754, + "learning_rate": 0.04625791348837114, + "loss": 0.0188, + "num_input_tokens_seen": 26892592, + "step": 29725 + }, + { + "epoch": 7.84650917249571, + "grad_norm": 0.001934496103785932, + "learning_rate": 0.046215376426830095, + "loss": 0.0402, + "num_input_tokens_seen": 26896720, + "step": 29730 + }, + { + "epoch": 7.847828956051208, + "grad_norm": 0.005961848422884941, + "learning_rate": 0.04617285537018219, + "loss": 0.0893, + "num_input_tokens_seen": 26901424, + "step": 29735 + }, + { + "epoch": 7.849148739606704, + "grad_norm": 0.0019493636209517717, + "learning_rate": 0.046130350324984803, + "loss": 0.0236, + "num_input_tokens_seen": 26906096, + "step": 29740 + }, + { + "epoch": 7.850468523162202, + "grad_norm": 0.0002922543208114803, + "learning_rate": 0.046087861297792666, + "loss": 0.0153, + "num_input_tokens_seen": 26910544, + "step": 29745 + }, + { + "epoch": 7.851788306717698, + "grad_norm": 0.00220900634303689, + "learning_rate": 0.0460453882951582, + "loss": 0.0301, + "num_input_tokens_seen": 26915056, + "step": 29750 + }, + { + "epoch": 7.853108090273195, + "grad_norm": 0.0006066165515221655, + "learning_rate": 0.04600293132363119, + "loss": 0.0295, + "num_input_tokens_seen": 26919408, + "step": 29755 + }, + { + "epoch": 7.854427873828692, + "grad_norm": 0.0007819513557478786, + "learning_rate": 0.045960490389759086, + "loss": 0.0179, + "num_input_tokens_seen": 26923824, + "step": 29760 + }, + { + "epoch": 7.855747657384189, + "grad_norm": 0.000601111096329987, + "learning_rate": 0.04591806550008685, + "loss": 0.0268, + "num_input_tokens_seen": 26928304, + "step": 29765 + }, + { + "epoch": 7.8570674409396855, + "grad_norm": 0.0013546457048505545, + "learning_rate": 0.045875656661156825, + "loss": 0.0254, + "num_input_tokens_seen": 26932848, + "step": 29770 + }, + { + "epoch": 7.858387224495183, + "grad_norm": 0.000247833231696859, + "learning_rate": 0.04583326387950911, + "loss": 0.0191, + "num_input_tokens_seen": 26937552, + "step": 29775 + }, + { + "epoch": 7.8597070080506795, + "grad_norm": 0.0046968283131718636, + "learning_rate": 0.0457908871616811, + "loss": 0.0753, + "num_input_tokens_seen": 26941936, + "step": 29780 + }, + { + "epoch": 7.861026791606177, + "grad_norm": 0.00039958799607120454, + "learning_rate": 0.04574852651420786, + "loss": 0.0279, + "num_input_tokens_seen": 26946672, + "step": 29785 + }, + { + "epoch": 7.8623465751616735, + "grad_norm": 0.0006816998356953263, + "learning_rate": 0.045706181943621985, + "loss": 0.0388, + "num_input_tokens_seen": 26951152, + "step": 29790 + }, + { + "epoch": 7.86366635871717, + "grad_norm": 0.005351299419999123, + "learning_rate": 0.04566385345645344, + "loss": 0.0271, + "num_input_tokens_seen": 26955824, + "step": 29795 + }, + { + "epoch": 7.8649861422726675, + "grad_norm": 0.0010314843384549022, + "learning_rate": 0.04562154105922993, + "loss": 0.0293, + "num_input_tokens_seen": 26960208, + "step": 29800 + }, + { + "epoch": 7.8649861422726675, + "eval_loss": 0.09360597282648087, + "eval_runtime": 75.9337, + "eval_samples_per_second": 88.696, + "eval_steps_per_second": 22.177, + "num_input_tokens_seen": 26960208, + "step": 29800 + }, + { + "epoch": 7.866305925828164, + "grad_norm": 0.003386810189113021, + "learning_rate": 0.04557924475847642, + "loss": 0.0482, + "num_input_tokens_seen": 26964592, + "step": 29805 + }, + { + "epoch": 7.8676257093836615, + "grad_norm": 0.0020817876793444157, + "learning_rate": 0.04553696456071567, + "loss": 0.0557, + "num_input_tokens_seen": 26968976, + "step": 29810 + }, + { + "epoch": 7.868945492939158, + "grad_norm": 0.0015091580571606755, + "learning_rate": 0.045494700472467724, + "loss": 0.0305, + "num_input_tokens_seen": 26973392, + "step": 29815 + }, + { + "epoch": 7.870265276494655, + "grad_norm": 0.0026621974539011717, + "learning_rate": 0.04545245250025024, + "loss": 0.0223, + "num_input_tokens_seen": 26977968, + "step": 29820 + }, + { + "epoch": 7.871585060050152, + "grad_norm": 0.002103799721226096, + "learning_rate": 0.045410220650578384, + "loss": 0.0285, + "num_input_tokens_seen": 26982736, + "step": 29825 + }, + { + "epoch": 7.872904843605649, + "grad_norm": 0.0036915463861078024, + "learning_rate": 0.04536800492996492, + "loss": 0.0399, + "num_input_tokens_seen": 26987536, + "step": 29830 + }, + { + "epoch": 7.874224627161146, + "grad_norm": 0.0033389234449714422, + "learning_rate": 0.04532580534491994, + "loss": 0.0322, + "num_input_tokens_seen": 26991984, + "step": 29835 + }, + { + "epoch": 7.875544410716643, + "grad_norm": 0.0020859113428741693, + "learning_rate": 0.045283621901951183, + "loss": 0.0273, + "num_input_tokens_seen": 26996336, + "step": 29840 + }, + { + "epoch": 7.876864194272139, + "grad_norm": 0.0026738066226243973, + "learning_rate": 0.04524145460756393, + "loss": 0.0411, + "num_input_tokens_seen": 27000912, + "step": 29845 + }, + { + "epoch": 7.878183977827637, + "grad_norm": 0.0063478099182248116, + "learning_rate": 0.045199303468260794, + "loss": 0.1177, + "num_input_tokens_seen": 27005200, + "step": 29850 + }, + { + "epoch": 7.879503761383133, + "grad_norm": 0.00017266097711399198, + "learning_rate": 0.04515716849054214, + "loss": 0.0218, + "num_input_tokens_seen": 27009712, + "step": 29855 + }, + { + "epoch": 7.88082354493863, + "grad_norm": 0.0006901527522131801, + "learning_rate": 0.04511504968090558, + "loss": 0.0452, + "num_input_tokens_seen": 27014032, + "step": 29860 + }, + { + "epoch": 7.882143328494127, + "grad_norm": 0.0014610018115490675, + "learning_rate": 0.04507294704584644, + "loss": 0.0234, + "num_input_tokens_seen": 27018512, + "step": 29865 + }, + { + "epoch": 7.883463112049624, + "grad_norm": 0.0044056144542992115, + "learning_rate": 0.04503086059185749, + "loss": 0.0468, + "num_input_tokens_seen": 27022992, + "step": 29870 + }, + { + "epoch": 7.884782895605121, + "grad_norm": 0.00046793429646641016, + "learning_rate": 0.04498879032542893, + "loss": 0.0528, + "num_input_tokens_seen": 27027632, + "step": 29875 + }, + { + "epoch": 7.886102679160618, + "grad_norm": 0.0012379779946058989, + "learning_rate": 0.0449467362530486, + "loss": 0.0162, + "num_input_tokens_seen": 27032016, + "step": 29880 + }, + { + "epoch": 7.887422462716114, + "grad_norm": 0.002805018564686179, + "learning_rate": 0.04490469838120171, + "loss": 0.0542, + "num_input_tokens_seen": 27036784, + "step": 29885 + }, + { + "epoch": 7.888742246271612, + "grad_norm": 0.005477383732795715, + "learning_rate": 0.04486267671637101, + "loss": 0.0459, + "num_input_tokens_seen": 27041584, + "step": 29890 + }, + { + "epoch": 7.890062029827108, + "grad_norm": 0.00047333622933365405, + "learning_rate": 0.04482067126503683, + "loss": 0.0368, + "num_input_tokens_seen": 27046384, + "step": 29895 + }, + { + "epoch": 7.891381813382605, + "grad_norm": 0.0008870005840435624, + "learning_rate": 0.04477868203367687, + "loss": 0.0245, + "num_input_tokens_seen": 27051120, + "step": 29900 + }, + { + "epoch": 7.892701596938102, + "grad_norm": 0.0015904313186183572, + "learning_rate": 0.044736709028766426, + "loss": 0.0572, + "num_input_tokens_seen": 27055696, + "step": 29905 + }, + { + "epoch": 7.894021380493599, + "grad_norm": 0.002954307245090604, + "learning_rate": 0.04469475225677832, + "loss": 0.0292, + "num_input_tokens_seen": 27060144, + "step": 29910 + }, + { + "epoch": 7.895341164049096, + "grad_norm": 0.003842963371425867, + "learning_rate": 0.04465281172418273, + "loss": 0.0346, + "num_input_tokens_seen": 27064432, + "step": 29915 + }, + { + "epoch": 7.896660947604593, + "grad_norm": 0.0004596865037456155, + "learning_rate": 0.044610887437447476, + "loss": 0.0164, + "num_input_tokens_seen": 27068720, + "step": 29920 + }, + { + "epoch": 7.897980731160089, + "grad_norm": 0.0024469150230288506, + "learning_rate": 0.044568979403037744, + "loss": 0.0341, + "num_input_tokens_seen": 27073520, + "step": 29925 + }, + { + "epoch": 7.899300514715587, + "grad_norm": 0.003446771763265133, + "learning_rate": 0.04452708762741631, + "loss": 0.0255, + "num_input_tokens_seen": 27077968, + "step": 29930 + }, + { + "epoch": 7.900620298271083, + "grad_norm": 0.0024367577861994505, + "learning_rate": 0.044485212117043475, + "loss": 0.0298, + "num_input_tokens_seen": 27082576, + "step": 29935 + }, + { + "epoch": 7.901940081826581, + "grad_norm": 0.005697181914001703, + "learning_rate": 0.04444335287837687, + "loss": 0.047, + "num_input_tokens_seen": 27087440, + "step": 29940 + }, + { + "epoch": 7.903259865382077, + "grad_norm": 0.0003762580454349518, + "learning_rate": 0.04440150991787179, + "loss": 0.0133, + "num_input_tokens_seen": 27092176, + "step": 29945 + }, + { + "epoch": 7.904579648937574, + "grad_norm": 0.0024676136672496796, + "learning_rate": 0.04435968324198088, + "loss": 0.0172, + "num_input_tokens_seen": 27097136, + "step": 29950 + }, + { + "epoch": 7.905899432493071, + "grad_norm": 0.0017613974632695317, + "learning_rate": 0.04431787285715442, + "loss": 0.0291, + "num_input_tokens_seen": 27101488, + "step": 29955 + }, + { + "epoch": 7.907219216048568, + "grad_norm": 0.0019779163412749767, + "learning_rate": 0.04427607876984004, + "loss": 0.0336, + "num_input_tokens_seen": 27106224, + "step": 29960 + }, + { + "epoch": 7.908538999604065, + "grad_norm": 0.001626770244911313, + "learning_rate": 0.044234300986482886, + "loss": 0.0293, + "num_input_tokens_seen": 27110864, + "step": 29965 + }, + { + "epoch": 7.909858783159562, + "grad_norm": 0.0034999181516468525, + "learning_rate": 0.04419253951352566, + "loss": 0.0191, + "num_input_tokens_seen": 27115152, + "step": 29970 + }, + { + "epoch": 7.9111785667150585, + "grad_norm": 0.003665482858195901, + "learning_rate": 0.044150794357408533, + "loss": 0.0459, + "num_input_tokens_seen": 27119760, + "step": 29975 + }, + { + "epoch": 7.912498350270556, + "grad_norm": 0.002841478679329157, + "learning_rate": 0.044109065524569065, + "loss": 0.0398, + "num_input_tokens_seen": 27124240, + "step": 29980 + }, + { + "epoch": 7.9138181338260525, + "grad_norm": 0.0006907469942234457, + "learning_rate": 0.0440673530214424, + "loss": 0.0372, + "num_input_tokens_seen": 27128816, + "step": 29985 + }, + { + "epoch": 7.915137917381549, + "grad_norm": 0.00030794148915447295, + "learning_rate": 0.04402565685446117, + "loss": 0.0153, + "num_input_tokens_seen": 27133328, + "step": 29990 + }, + { + "epoch": 7.9164577009370465, + "grad_norm": 0.003118197899311781, + "learning_rate": 0.04398397703005536, + "loss": 0.0375, + "num_input_tokens_seen": 27137936, + "step": 29995 + }, + { + "epoch": 7.917777484492543, + "grad_norm": 0.0016517132753506303, + "learning_rate": 0.043942313554652626, + "loss": 0.012, + "num_input_tokens_seen": 27142320, + "step": 30000 + }, + { + "epoch": 7.917777484492543, + "eval_loss": 0.09325231611728668, + "eval_runtime": 75.8883, + "eval_samples_per_second": 88.749, + "eval_steps_per_second": 22.191, + "num_input_tokens_seen": 27142320, + "step": 30000 + }, + { + "epoch": 7.9190972680480405, + "grad_norm": 0.00123637355864048, + "learning_rate": 0.0439006664346779, + "loss": 0.0267, + "num_input_tokens_seen": 27146768, + "step": 30005 + }, + { + "epoch": 7.920417051603537, + "grad_norm": 0.002332353265956044, + "learning_rate": 0.043859035676553755, + "loss": 0.0264, + "num_input_tokens_seen": 27151440, + "step": 30010 + }, + { + "epoch": 7.921736835159034, + "grad_norm": 0.004806251265108585, + "learning_rate": 0.043817421286700194, + "loss": 0.0329, + "num_input_tokens_seen": 27155696, + "step": 30015 + }, + { + "epoch": 7.923056618714531, + "grad_norm": 0.0054399105720222, + "learning_rate": 0.043775823271534585, + "loss": 0.0485, + "num_input_tokens_seen": 27160016, + "step": 30020 + }, + { + "epoch": 7.924376402270028, + "grad_norm": 0.0010895384475588799, + "learning_rate": 0.04373424163747197, + "loss": 0.0163, + "num_input_tokens_seen": 27164656, + "step": 30025 + }, + { + "epoch": 7.925696185825524, + "grad_norm": 0.0004917655023746192, + "learning_rate": 0.04369267639092473, + "loss": 0.0322, + "num_input_tokens_seen": 27169168, + "step": 30030 + }, + { + "epoch": 7.927015969381022, + "grad_norm": 0.0001470383140258491, + "learning_rate": 0.04365112753830268, + "loss": 0.0199, + "num_input_tokens_seen": 27173808, + "step": 30035 + }, + { + "epoch": 7.928335752936518, + "grad_norm": 0.0003403414157219231, + "learning_rate": 0.04360959508601327, + "loss": 0.0078, + "num_input_tokens_seen": 27178480, + "step": 30040 + }, + { + "epoch": 7.929655536492016, + "grad_norm": 0.002687410917133093, + "learning_rate": 0.04356807904046123, + "loss": 0.0193, + "num_input_tokens_seen": 27182672, + "step": 30045 + }, + { + "epoch": 7.930975320047512, + "grad_norm": 0.0005623701144941151, + "learning_rate": 0.04352657940804892, + "loss": 0.0076, + "num_input_tokens_seen": 27186800, + "step": 30050 + }, + { + "epoch": 7.932295103603009, + "grad_norm": 0.0006435487885028124, + "learning_rate": 0.04348509619517613, + "loss": 0.009, + "num_input_tokens_seen": 27191472, + "step": 30055 + }, + { + "epoch": 7.933614887158506, + "grad_norm": 0.0010565355187281966, + "learning_rate": 0.04344362940824002, + "loss": 0.0594, + "num_input_tokens_seen": 27196144, + "step": 30060 + }, + { + "epoch": 7.934934670714003, + "grad_norm": 0.0005938952672295272, + "learning_rate": 0.04340217905363533, + "loss": 0.0428, + "num_input_tokens_seen": 27200432, + "step": 30065 + }, + { + "epoch": 7.9362544542695, + "grad_norm": 0.00761384004727006, + "learning_rate": 0.04336074513775425, + "loss": 0.096, + "num_input_tokens_seen": 27204560, + "step": 30070 + }, + { + "epoch": 7.937574237824997, + "grad_norm": 0.00017104388098232448, + "learning_rate": 0.04331932766698636, + "loss": 0.0146, + "num_input_tokens_seen": 27209040, + "step": 30075 + }, + { + "epoch": 7.938894021380493, + "grad_norm": 0.002747097285464406, + "learning_rate": 0.0432779266477188, + "loss": 0.0301, + "num_input_tokens_seen": 27213456, + "step": 30080 + }, + { + "epoch": 7.940213804935991, + "grad_norm": 0.002810270292684436, + "learning_rate": 0.04323654208633607, + "loss": 0.0443, + "num_input_tokens_seen": 27217936, + "step": 30085 + }, + { + "epoch": 7.941533588491487, + "grad_norm": 0.0002521109418012202, + "learning_rate": 0.04319517398922024, + "loss": 0.0127, + "num_input_tokens_seen": 27222672, + "step": 30090 + }, + { + "epoch": 7.942853372046985, + "grad_norm": 0.0004723399761132896, + "learning_rate": 0.04315382236275079, + "loss": 0.0112, + "num_input_tokens_seen": 27227248, + "step": 30095 + }, + { + "epoch": 7.944173155602481, + "grad_norm": 0.00036342389648780227, + "learning_rate": 0.043112487213304664, + "loss": 0.02, + "num_input_tokens_seen": 27231856, + "step": 30100 + }, + { + "epoch": 7.945492939157978, + "grad_norm": 0.003920980729162693, + "learning_rate": 0.04307116854725618, + "loss": 0.0268, + "num_input_tokens_seen": 27236240, + "step": 30105 + }, + { + "epoch": 7.946812722713475, + "grad_norm": 0.000366397260222584, + "learning_rate": 0.043029866370977325, + "loss": 0.0341, + "num_input_tokens_seen": 27240912, + "step": 30110 + }, + { + "epoch": 7.948132506268972, + "grad_norm": 0.0001758492726366967, + "learning_rate": 0.04298858069083728, + "loss": 0.0425, + "num_input_tokens_seen": 27245488, + "step": 30115 + }, + { + "epoch": 7.949452289824469, + "grad_norm": 0.003168045310303569, + "learning_rate": 0.04294731151320295, + "loss": 0.0552, + "num_input_tokens_seen": 27250032, + "step": 30120 + }, + { + "epoch": 7.950772073379966, + "grad_norm": 0.004006498027592897, + "learning_rate": 0.04290605884443841, + "loss": 0.0194, + "num_input_tokens_seen": 27254608, + "step": 30125 + }, + { + "epoch": 7.952091856935462, + "grad_norm": 0.002142583951354027, + "learning_rate": 0.04286482269090545, + "loss": 0.0532, + "num_input_tokens_seen": 27259536, + "step": 30130 + }, + { + "epoch": 7.95341164049096, + "grad_norm": 0.001905589597299695, + "learning_rate": 0.04282360305896323, + "loss": 0.0238, + "num_input_tokens_seen": 27263728, + "step": 30135 + }, + { + "epoch": 7.954731424046456, + "grad_norm": 0.0029442464001476765, + "learning_rate": 0.04278239995496822, + "loss": 0.0164, + "num_input_tokens_seen": 27268336, + "step": 30140 + }, + { + "epoch": 7.956051207601953, + "grad_norm": 0.0016190578462556005, + "learning_rate": 0.042741213385274514, + "loss": 0.0197, + "num_input_tokens_seen": 27272976, + "step": 30145 + }, + { + "epoch": 7.95737099115745, + "grad_norm": 0.001703832414932549, + "learning_rate": 0.04270004335623366, + "loss": 0.0338, + "num_input_tokens_seen": 27277392, + "step": 30150 + }, + { + "epoch": 7.958690774712947, + "grad_norm": 0.0010584586998447776, + "learning_rate": 0.04265888987419448, + "loss": 0.0294, + "num_input_tokens_seen": 27281776, + "step": 30155 + }, + { + "epoch": 7.9600105582684435, + "grad_norm": 0.0029098265804350376, + "learning_rate": 0.04261775294550346, + "loss": 0.041, + "num_input_tokens_seen": 27286416, + "step": 30160 + }, + { + "epoch": 7.961330341823941, + "grad_norm": 0.003459265222772956, + "learning_rate": 0.042576632576504354, + "loss": 0.0228, + "num_input_tokens_seen": 27290736, + "step": 30165 + }, + { + "epoch": 7.9626501253794375, + "grad_norm": 0.004931815899908543, + "learning_rate": 0.0425355287735385, + "loss": 0.0386, + "num_input_tokens_seen": 27295248, + "step": 30170 + }, + { + "epoch": 7.963969908934935, + "grad_norm": 0.0024332290049642324, + "learning_rate": 0.0424944415429446, + "loss": 0.0384, + "num_input_tokens_seen": 27299856, + "step": 30175 + }, + { + "epoch": 7.9652896924904315, + "grad_norm": 0.0009010196663439274, + "learning_rate": 0.04245337089105877, + "loss": 0.0461, + "num_input_tokens_seen": 27304464, + "step": 30180 + }, + { + "epoch": 7.966609476045928, + "grad_norm": 0.003614619607105851, + "learning_rate": 0.04241231682421467, + "loss": 0.0592, + "num_input_tokens_seen": 27308816, + "step": 30185 + }, + { + "epoch": 7.9679292596014255, + "grad_norm": 0.0007382665062323213, + "learning_rate": 0.04237127934874337, + "loss": 0.0265, + "num_input_tokens_seen": 27313552, + "step": 30190 + }, + { + "epoch": 7.969249043156922, + "grad_norm": 0.0033406848087906837, + "learning_rate": 0.042330258470973305, + "loss": 0.0449, + "num_input_tokens_seen": 27318192, + "step": 30195 + }, + { + "epoch": 7.9705688267124195, + "grad_norm": 0.001395914820022881, + "learning_rate": 0.042289254197230515, + "loss": 0.0393, + "num_input_tokens_seen": 27322864, + "step": 30200 + }, + { + "epoch": 7.9705688267124195, + "eval_loss": 0.09297545999288559, + "eval_runtime": 75.9181, + "eval_samples_per_second": 88.714, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 27322864, + "step": 30200 + }, + { + "epoch": 7.971888610267916, + "grad_norm": 0.0017485065618529916, + "learning_rate": 0.04224826653383823, + "loss": 0.0356, + "num_input_tokens_seen": 27327792, + "step": 30205 + }, + { + "epoch": 7.973208393823413, + "grad_norm": 0.000500108115375042, + "learning_rate": 0.04220729548711735, + "loss": 0.0094, + "num_input_tokens_seen": 27332304, + "step": 30210 + }, + { + "epoch": 7.97452817737891, + "grad_norm": 0.0016160064842551947, + "learning_rate": 0.04216634106338616, + "loss": 0.0835, + "num_input_tokens_seen": 27336912, + "step": 30215 + }, + { + "epoch": 7.975847960934407, + "grad_norm": 0.002543529961258173, + "learning_rate": 0.04212540326896025, + "loss": 0.0358, + "num_input_tokens_seen": 27341072, + "step": 30220 + }, + { + "epoch": 7.977167744489904, + "grad_norm": 0.0012819156982004642, + "learning_rate": 0.0420844821101528, + "loss": 0.0725, + "num_input_tokens_seen": 27345648, + "step": 30225 + }, + { + "epoch": 7.978487528045401, + "grad_norm": 0.0008244309574365616, + "learning_rate": 0.04204357759327441, + "loss": 0.0281, + "num_input_tokens_seen": 27350064, + "step": 30230 + }, + { + "epoch": 7.979807311600897, + "grad_norm": 0.004496915265917778, + "learning_rate": 0.042002689724632954, + "loss": 0.0382, + "num_input_tokens_seen": 27354352, + "step": 30235 + }, + { + "epoch": 7.981127095156395, + "grad_norm": 0.0012913381215184927, + "learning_rate": 0.04196181851053398, + "loss": 0.0249, + "num_input_tokens_seen": 27358640, + "step": 30240 + }, + { + "epoch": 7.982446878711891, + "grad_norm": 0.0010736379772424698, + "learning_rate": 0.041920963957280295, + "loss": 0.022, + "num_input_tokens_seen": 27363344, + "step": 30245 + }, + { + "epoch": 7.983766662267389, + "grad_norm": 0.0021633116994053125, + "learning_rate": 0.04188012607117212, + "loss": 0.0493, + "num_input_tokens_seen": 27367728, + "step": 30250 + }, + { + "epoch": 7.985086445822885, + "grad_norm": 0.00272866222076118, + "learning_rate": 0.04183930485850725, + "loss": 0.0428, + "num_input_tokens_seen": 27372272, + "step": 30255 + }, + { + "epoch": 7.986406229378382, + "grad_norm": 0.002948412438854575, + "learning_rate": 0.04179850032558078, + "loss": 0.0126, + "num_input_tokens_seen": 27376944, + "step": 30260 + }, + { + "epoch": 7.987726012933879, + "grad_norm": 0.0006520614842884243, + "learning_rate": 0.041757712478685295, + "loss": 0.0243, + "num_input_tokens_seen": 27381744, + "step": 30265 + }, + { + "epoch": 7.989045796489376, + "grad_norm": 0.0015872425865381956, + "learning_rate": 0.04171694132411085, + "loss": 0.0275, + "num_input_tokens_seen": 27386064, + "step": 30270 + }, + { + "epoch": 7.990365580044872, + "grad_norm": 0.002620995743200183, + "learning_rate": 0.04167618686814479, + "loss": 0.0224, + "num_input_tokens_seen": 27390448, + "step": 30275 + }, + { + "epoch": 7.99168536360037, + "grad_norm": 0.003747528651729226, + "learning_rate": 0.041635449117072024, + "loss": 0.0144, + "num_input_tokens_seen": 27394800, + "step": 30280 + }, + { + "epoch": 7.993005147155866, + "grad_norm": 0.0012367375893518329, + "learning_rate": 0.04159472807717477, + "loss": 0.0167, + "num_input_tokens_seen": 27399312, + "step": 30285 + }, + { + "epoch": 7.994324930711363, + "grad_norm": 0.005079852417111397, + "learning_rate": 0.041554023754732744, + "loss": 0.0516, + "num_input_tokens_seen": 27403888, + "step": 30290 + }, + { + "epoch": 7.99564471426686, + "grad_norm": 0.0004842373891733587, + "learning_rate": 0.04151333615602311, + "loss": 0.0439, + "num_input_tokens_seen": 27408400, + "step": 30295 + }, + { + "epoch": 7.996964497822357, + "grad_norm": 0.0011323817307129502, + "learning_rate": 0.04147266528732034, + "loss": 0.0337, + "num_input_tokens_seen": 27412752, + "step": 30300 + }, + { + "epoch": 7.998284281377854, + "grad_norm": 0.0022377073764801025, + "learning_rate": 0.0414320111548964, + "loss": 0.0344, + "num_input_tokens_seen": 27417648, + "step": 30305 + }, + { + "epoch": 7.999604064933351, + "grad_norm": 0.0032963326666504145, + "learning_rate": 0.04139137376502076, + "loss": 0.0313, + "num_input_tokens_seen": 27421936, + "step": 30310 + }, + { + "epoch": 8.000791870133298, + "grad_norm": 0.000799052300862968, + "learning_rate": 0.04135075312396014, + "loss": 0.0698, + "num_input_tokens_seen": 27425600, + "step": 30315 + }, + { + "epoch": 8.002111653688795, + "grad_norm": 8.671712566865608e-05, + "learning_rate": 0.04131014923797875, + "loss": 0.0132, + "num_input_tokens_seen": 27430144, + "step": 30320 + }, + { + "epoch": 8.003431437244291, + "grad_norm": 0.0038245830219238997, + "learning_rate": 0.04126956211333819, + "loss": 0.0166, + "num_input_tokens_seen": 27434752, + "step": 30325 + }, + { + "epoch": 8.004751220799788, + "grad_norm": 0.003995254635810852, + "learning_rate": 0.041228991756297545, + "loss": 0.0385, + "num_input_tokens_seen": 27439360, + "step": 30330 + }, + { + "epoch": 8.006071004355286, + "grad_norm": 0.0018711852608248591, + "learning_rate": 0.04118843817311332, + "loss": 0.0267, + "num_input_tokens_seen": 27443584, + "step": 30335 + }, + { + "epoch": 8.007390787910783, + "grad_norm": 0.00033859332324936986, + "learning_rate": 0.0411479013700393, + "loss": 0.0177, + "num_input_tokens_seen": 27448288, + "step": 30340 + }, + { + "epoch": 8.00871057146628, + "grad_norm": 0.0013700427953153849, + "learning_rate": 0.0411073813533268, + "loss": 0.0365, + "num_input_tokens_seen": 27452576, + "step": 30345 + }, + { + "epoch": 8.010030355021776, + "grad_norm": 0.0004736970877274871, + "learning_rate": 0.04106687812922456, + "loss": 0.0191, + "num_input_tokens_seen": 27456992, + "step": 30350 + }, + { + "epoch": 8.011350138577273, + "grad_norm": 0.0026823643129318953, + "learning_rate": 0.041026391703978635, + "loss": 0.0162, + "num_input_tokens_seen": 27461568, + "step": 30355 + }, + { + "epoch": 8.01266992213277, + "grad_norm": 0.002237809356302023, + "learning_rate": 0.04098592208383259, + "loss": 0.0128, + "num_input_tokens_seen": 27466400, + "step": 30360 + }, + { + "epoch": 8.013989705688267, + "grad_norm": 0.000575520156417042, + "learning_rate": 0.040945469275027256, + "loss": 0.0171, + "num_input_tokens_seen": 27470720, + "step": 30365 + }, + { + "epoch": 8.015309489243764, + "grad_norm": 0.0012157190358266234, + "learning_rate": 0.04090503328380104, + "loss": 0.0295, + "num_input_tokens_seen": 27475104, + "step": 30370 + }, + { + "epoch": 8.01662927279926, + "grad_norm": 0.0015490979421883821, + "learning_rate": 0.04086461411638971, + "loss": 0.0235, + "num_input_tokens_seen": 27479616, + "step": 30375 + }, + { + "epoch": 8.017949056354757, + "grad_norm": 0.0017343424260616302, + "learning_rate": 0.04082421177902631, + "loss": 0.0284, + "num_input_tokens_seen": 27484448, + "step": 30380 + }, + { + "epoch": 8.019268839910255, + "grad_norm": 0.000548519950825721, + "learning_rate": 0.04078382627794149, + "loss": 0.025, + "num_input_tokens_seen": 27488672, + "step": 30385 + }, + { + "epoch": 8.020588623465752, + "grad_norm": 0.006327936425805092, + "learning_rate": 0.04074345761936316, + "loss": 0.0308, + "num_input_tokens_seen": 27493088, + "step": 30390 + }, + { + "epoch": 8.021908407021249, + "grad_norm": 0.0020206538029015064, + "learning_rate": 0.04070310580951663, + "loss": 0.0477, + "num_input_tokens_seen": 27497600, + "step": 30395 + }, + { + "epoch": 8.023228190576745, + "grad_norm": 0.0010151303140446544, + "learning_rate": 0.040662770854624726, + "loss": 0.0263, + "num_input_tokens_seen": 27502208, + "step": 30400 + }, + { + "epoch": 8.023228190576745, + "eval_loss": 0.09805966913700104, + "eval_runtime": 75.7921, + "eval_samples_per_second": 88.861, + "eval_steps_per_second": 22.219, + "num_input_tokens_seen": 27502208, + "step": 30400 + }, + { + "epoch": 8.024547974132242, + "grad_norm": 0.0029082391411066055, + "learning_rate": 0.040622452760907535, + "loss": 0.0219, + "num_input_tokens_seen": 27506464, + "step": 30405 + }, + { + "epoch": 8.02586775768774, + "grad_norm": 0.0024383082054555416, + "learning_rate": 0.04058215153458265, + "loss": 0.0276, + "num_input_tokens_seen": 27511008, + "step": 30410 + }, + { + "epoch": 8.027187541243237, + "grad_norm": 0.0005584284081123769, + "learning_rate": 0.04054186718186507, + "loss": 0.0058, + "num_input_tokens_seen": 27515840, + "step": 30415 + }, + { + "epoch": 8.028507324798733, + "grad_norm": 0.0010987123241648078, + "learning_rate": 0.04050159970896708, + "loss": 0.0195, + "num_input_tokens_seen": 27520256, + "step": 30420 + }, + { + "epoch": 8.02982710835423, + "grad_norm": 0.001831791247241199, + "learning_rate": 0.04046134912209843, + "loss": 0.0124, + "num_input_tokens_seen": 27524832, + "step": 30425 + }, + { + "epoch": 8.031146891909726, + "grad_norm": 0.0024513443931937218, + "learning_rate": 0.040421115427466354, + "loss": 0.0199, + "num_input_tokens_seen": 27529056, + "step": 30430 + }, + { + "epoch": 8.032466675465225, + "grad_norm": 0.0016491527203470469, + "learning_rate": 0.04038089863127529, + "loss": 0.0111, + "num_input_tokens_seen": 27533664, + "step": 30435 + }, + { + "epoch": 8.033786459020721, + "grad_norm": 0.00027761413366533816, + "learning_rate": 0.04034069873972727, + "loss": 0.0198, + "num_input_tokens_seen": 27538176, + "step": 30440 + }, + { + "epoch": 8.035106242576218, + "grad_norm": 0.0007431777776218951, + "learning_rate": 0.040300515759021514, + "loss": 0.0298, + "num_input_tokens_seen": 27542848, + "step": 30445 + }, + { + "epoch": 8.036426026131714, + "grad_norm": 0.0010231996420770884, + "learning_rate": 0.04026034969535478, + "loss": 0.0149, + "num_input_tokens_seen": 27547424, + "step": 30450 + }, + { + "epoch": 8.03774580968721, + "grad_norm": 0.0007517791818827391, + "learning_rate": 0.040220200554921266, + "loss": 0.0129, + "num_input_tokens_seen": 27551936, + "step": 30455 + }, + { + "epoch": 8.039065593242707, + "grad_norm": 0.0003187601105310023, + "learning_rate": 0.0401800683439124, + "loss": 0.0138, + "num_input_tokens_seen": 27556192, + "step": 30460 + }, + { + "epoch": 8.040385376798206, + "grad_norm": 0.0038423947989940643, + "learning_rate": 0.04013995306851704, + "loss": 0.0116, + "num_input_tokens_seen": 27560768, + "step": 30465 + }, + { + "epoch": 8.041705160353702, + "grad_norm": 0.0008120359270833433, + "learning_rate": 0.040099854734921545, + "loss": 0.0103, + "num_input_tokens_seen": 27565248, + "step": 30470 + }, + { + "epoch": 8.043024943909199, + "grad_norm": 0.003738576080650091, + "learning_rate": 0.0400597733493095, + "loss": 0.0321, + "num_input_tokens_seen": 27569472, + "step": 30475 + }, + { + "epoch": 8.044344727464695, + "grad_norm": 0.007473658304661512, + "learning_rate": 0.04001970891786203, + "loss": 0.0398, + "num_input_tokens_seen": 27574080, + "step": 30480 + }, + { + "epoch": 8.045664511020192, + "grad_norm": 0.0034866188652813435, + "learning_rate": 0.03997966144675752, + "loss": 0.0401, + "num_input_tokens_seen": 27578720, + "step": 30485 + }, + { + "epoch": 8.04698429457569, + "grad_norm": 0.0010081167565658689, + "learning_rate": 0.039939630942171796, + "loss": 0.0058, + "num_input_tokens_seen": 27583168, + "step": 30490 + }, + { + "epoch": 8.048304078131187, + "grad_norm": 0.0025032181292772293, + "learning_rate": 0.03989961741027815, + "loss": 0.0193, + "num_input_tokens_seen": 27587488, + "step": 30495 + }, + { + "epoch": 8.049623861686683, + "grad_norm": 0.001649549463763833, + "learning_rate": 0.03985962085724704, + "loss": 0.0143, + "num_input_tokens_seen": 27591936, + "step": 30500 + }, + { + "epoch": 8.05094364524218, + "grad_norm": 0.0005595517577603459, + "learning_rate": 0.03981964128924656, + "loss": 0.0266, + "num_input_tokens_seen": 27596544, + "step": 30505 + }, + { + "epoch": 8.052263428797676, + "grad_norm": 0.00035376966116018593, + "learning_rate": 0.03977967871244197, + "loss": 0.0325, + "num_input_tokens_seen": 27601408, + "step": 30510 + }, + { + "epoch": 8.053583212353175, + "grad_norm": 0.004580790176987648, + "learning_rate": 0.03973973313299602, + "loss": 0.0602, + "num_input_tokens_seen": 27606080, + "step": 30515 + }, + { + "epoch": 8.054902995908671, + "grad_norm": 0.0019453393761068583, + "learning_rate": 0.0396998045570689, + "loss": 0.061, + "num_input_tokens_seen": 27610688, + "step": 30520 + }, + { + "epoch": 8.056222779464168, + "grad_norm": 0.0018438845872879028, + "learning_rate": 0.03965989299081798, + "loss": 0.0155, + "num_input_tokens_seen": 27615392, + "step": 30525 + }, + { + "epoch": 8.057542563019664, + "grad_norm": 0.002894786885008216, + "learning_rate": 0.039619998440398235, + "loss": 0.0233, + "num_input_tokens_seen": 27619680, + "step": 30530 + }, + { + "epoch": 8.058862346575161, + "grad_norm": 0.0013786198105663061, + "learning_rate": 0.03958012091196184, + "loss": 0.0329, + "num_input_tokens_seen": 27624320, + "step": 30535 + }, + { + "epoch": 8.06018213013066, + "grad_norm": 0.0021194317378103733, + "learning_rate": 0.039540260411658396, + "loss": 0.0181, + "num_input_tokens_seen": 27628544, + "step": 30540 + }, + { + "epoch": 8.061501913686156, + "grad_norm": 0.006787663791328669, + "learning_rate": 0.03950041694563496, + "loss": 0.0588, + "num_input_tokens_seen": 27632864, + "step": 30545 + }, + { + "epoch": 8.062821697241652, + "grad_norm": 0.0014261818723753095, + "learning_rate": 0.0394605905200358, + "loss": 0.0162, + "num_input_tokens_seen": 27637888, + "step": 30550 + }, + { + "epoch": 8.064141480797149, + "grad_norm": 0.0008329079137183726, + "learning_rate": 0.03942078114100272, + "loss": 0.0336, + "num_input_tokens_seen": 27642496, + "step": 30555 + }, + { + "epoch": 8.065461264352646, + "grad_norm": 0.0020678932778537273, + "learning_rate": 0.03938098881467485, + "loss": 0.0191, + "num_input_tokens_seen": 27647168, + "step": 30560 + }, + { + "epoch": 8.066781047908144, + "grad_norm": 0.0005318881594575942, + "learning_rate": 0.039341213547188586, + "loss": 0.0157, + "num_input_tokens_seen": 27651840, + "step": 30565 + }, + { + "epoch": 8.06810083146364, + "grad_norm": 0.0017630220390856266, + "learning_rate": 0.03930145534467782, + "loss": 0.0078, + "num_input_tokens_seen": 27656384, + "step": 30570 + }, + { + "epoch": 8.069420615019137, + "grad_norm": 0.0034617730416357517, + "learning_rate": 0.0392617142132738, + "loss": 0.0176, + "num_input_tokens_seen": 27660992, + "step": 30575 + }, + { + "epoch": 8.070740398574634, + "grad_norm": 0.0005800786311738193, + "learning_rate": 0.03922199015910504, + "loss": 0.0125, + "num_input_tokens_seen": 27665376, + "step": 30580 + }, + { + "epoch": 8.07206018213013, + "grad_norm": 0.002275154460221529, + "learning_rate": 0.039182283188297556, + "loss": 0.0284, + "num_input_tokens_seen": 27669760, + "step": 30585 + }, + { + "epoch": 8.073379965685628, + "grad_norm": 0.0003862351586576551, + "learning_rate": 0.039142593306974595, + "loss": 0.0232, + "num_input_tokens_seen": 27674336, + "step": 30590 + }, + { + "epoch": 8.074699749241125, + "grad_norm": 0.002722149481996894, + "learning_rate": 0.039102920521256856, + "loss": 0.0237, + "num_input_tokens_seen": 27678944, + "step": 30595 + }, + { + "epoch": 8.076019532796622, + "grad_norm": 0.000277863786322996, + "learning_rate": 0.03906326483726243, + "loss": 0.0355, + "num_input_tokens_seen": 27683072, + "step": 30600 + }, + { + "epoch": 8.076019532796622, + "eval_loss": 0.10162412375211716, + "eval_runtime": 75.8174, + "eval_samples_per_second": 88.832, + "eval_steps_per_second": 22.211, + "num_input_tokens_seen": 27683072, + "step": 30600 + }, + { + "epoch": 8.077339316352118, + "grad_norm": 0.002043369458988309, + "learning_rate": 0.039023626261106704, + "loss": 0.0188, + "num_input_tokens_seen": 27687552, + "step": 30605 + }, + { + "epoch": 8.078659099907615, + "grad_norm": 0.0030173601116985083, + "learning_rate": 0.03898400479890237, + "loss": 0.0123, + "num_input_tokens_seen": 27692288, + "step": 30610 + }, + { + "epoch": 8.079978883463111, + "grad_norm": 0.0003658193745650351, + "learning_rate": 0.038944400456759655, + "loss": 0.049, + "num_input_tokens_seen": 27696928, + "step": 30615 + }, + { + "epoch": 8.08129866701861, + "grad_norm": 0.004577195271849632, + "learning_rate": 0.038904813240785964, + "loss": 0.0328, + "num_input_tokens_seen": 27701440, + "step": 30620 + }, + { + "epoch": 8.082618450574106, + "grad_norm": 0.0008913431083783507, + "learning_rate": 0.03886524315708621, + "loss": 0.0316, + "num_input_tokens_seen": 27705952, + "step": 30625 + }, + { + "epoch": 8.083938234129603, + "grad_norm": 0.0011987132020294666, + "learning_rate": 0.03882569021176255, + "loss": 0.0243, + "num_input_tokens_seen": 27710304, + "step": 30630 + }, + { + "epoch": 8.0852580176851, + "grad_norm": 0.001703336020000279, + "learning_rate": 0.038786154410914535, + "loss": 0.0126, + "num_input_tokens_seen": 27714848, + "step": 30635 + }, + { + "epoch": 8.086577801240596, + "grad_norm": 0.004783847369253635, + "learning_rate": 0.03874663576063917, + "loss": 0.0135, + "num_input_tokens_seen": 27719520, + "step": 30640 + }, + { + "epoch": 8.087897584796094, + "grad_norm": 0.0010184594430029392, + "learning_rate": 0.038707134267030624, + "loss": 0.012, + "num_input_tokens_seen": 27724096, + "step": 30645 + }, + { + "epoch": 8.08921736835159, + "grad_norm": 0.002977854572236538, + "learning_rate": 0.038667649936180555, + "loss": 0.0665, + "num_input_tokens_seen": 27728768, + "step": 30650 + }, + { + "epoch": 8.090537151907087, + "grad_norm": 0.0001649848563829437, + "learning_rate": 0.038628182774178, + "loss": 0.0282, + "num_input_tokens_seen": 27733344, + "step": 30655 + }, + { + "epoch": 8.091856935462584, + "grad_norm": 0.0007981599774211645, + "learning_rate": 0.038588732787109226, + "loss": 0.0044, + "num_input_tokens_seen": 27738176, + "step": 30660 + }, + { + "epoch": 8.09317671901808, + "grad_norm": 0.0027105642948299646, + "learning_rate": 0.03854929998105795, + "loss": 0.0232, + "num_input_tokens_seen": 27742752, + "step": 30665 + }, + { + "epoch": 8.094496502573579, + "grad_norm": 0.0014139279955998063, + "learning_rate": 0.03850988436210518, + "loss": 0.0295, + "num_input_tokens_seen": 27747392, + "step": 30670 + }, + { + "epoch": 8.095816286129075, + "grad_norm": 0.0035106856375932693, + "learning_rate": 0.03847048593632933, + "loss": 0.0376, + "num_input_tokens_seen": 27752096, + "step": 30675 + }, + { + "epoch": 8.097136069684572, + "grad_norm": 0.0006969820242375135, + "learning_rate": 0.038431104709806096, + "loss": 0.0248, + "num_input_tokens_seen": 27756448, + "step": 30680 + }, + { + "epoch": 8.098455853240068, + "grad_norm": 0.0028912480920553207, + "learning_rate": 0.0383917406886086, + "loss": 0.0372, + "num_input_tokens_seen": 27760672, + "step": 30685 + }, + { + "epoch": 8.099775636795565, + "grad_norm": 0.0004995826748199761, + "learning_rate": 0.03835239387880722, + "loss": 0.0088, + "num_input_tokens_seen": 27765344, + "step": 30690 + }, + { + "epoch": 8.101095420351063, + "grad_norm": 0.0023694115225225687, + "learning_rate": 0.03831306428646979, + "loss": 0.0263, + "num_input_tokens_seen": 27770080, + "step": 30695 + }, + { + "epoch": 8.10241520390656, + "grad_norm": 0.00029673048993572593, + "learning_rate": 0.03827375191766135, + "loss": 0.0123, + "num_input_tokens_seen": 27774560, + "step": 30700 + }, + { + "epoch": 8.103734987462056, + "grad_norm": 0.0011331280693411827, + "learning_rate": 0.03823445677844446, + "loss": 0.0199, + "num_input_tokens_seen": 27779040, + "step": 30705 + }, + { + "epoch": 8.105054771017553, + "grad_norm": 0.00224398635327816, + "learning_rate": 0.03819517887487881, + "loss": 0.0155, + "num_input_tokens_seen": 27783680, + "step": 30710 + }, + { + "epoch": 8.10637455457305, + "grad_norm": 0.0007108478457666934, + "learning_rate": 0.03815591821302161, + "loss": 0.0118, + "num_input_tokens_seen": 27788032, + "step": 30715 + }, + { + "epoch": 8.107694338128548, + "grad_norm": 0.00017126715101767331, + "learning_rate": 0.03811667479892739, + "loss": 0.011, + "num_input_tokens_seen": 27792352, + "step": 30720 + }, + { + "epoch": 8.109014121684044, + "grad_norm": 0.0005061959382146597, + "learning_rate": 0.03807744863864788, + "loss": 0.0327, + "num_input_tokens_seen": 27796928, + "step": 30725 + }, + { + "epoch": 8.110333905239541, + "grad_norm": 0.0005757003091275692, + "learning_rate": 0.03803823973823229, + "loss": 0.0378, + "num_input_tokens_seen": 27801408, + "step": 30730 + }, + { + "epoch": 8.111653688795037, + "grad_norm": 0.0005829838919453323, + "learning_rate": 0.03799904810372719, + "loss": 0.0207, + "num_input_tokens_seen": 27805952, + "step": 30735 + }, + { + "epoch": 8.112973472350534, + "grad_norm": 0.00035607864265330136, + "learning_rate": 0.03795987374117632, + "loss": 0.0282, + "num_input_tokens_seen": 27810240, + "step": 30740 + }, + { + "epoch": 8.11429325590603, + "grad_norm": 0.003070784267038107, + "learning_rate": 0.03792071665662093, + "loss": 0.0321, + "num_input_tokens_seen": 27814816, + "step": 30745 + }, + { + "epoch": 8.115613039461529, + "grad_norm": 0.0017600716091692448, + "learning_rate": 0.03788157685609952, + "loss": 0.0178, + "num_input_tokens_seen": 27819840, + "step": 30750 + }, + { + "epoch": 8.116932823017025, + "grad_norm": 0.003667286131531, + "learning_rate": 0.037842454345647876, + "loss": 0.0181, + "num_input_tokens_seen": 27824160, + "step": 30755 + }, + { + "epoch": 8.118252606572522, + "grad_norm": 0.0009819287806749344, + "learning_rate": 0.03780334913129929, + "loss": 0.025, + "num_input_tokens_seen": 27828608, + "step": 30760 + }, + { + "epoch": 8.119572390128019, + "grad_norm": 0.000652976450510323, + "learning_rate": 0.037764261219084175, + "loss": 0.012, + "num_input_tokens_seen": 27833184, + "step": 30765 + }, + { + "epoch": 8.120892173683515, + "grad_norm": 0.0043371254578232765, + "learning_rate": 0.037725190615030414, + "loss": 0.0325, + "num_input_tokens_seen": 27837728, + "step": 30770 + }, + { + "epoch": 8.122211957239013, + "grad_norm": 0.000865710957441479, + "learning_rate": 0.037686137325163224, + "loss": 0.0181, + "num_input_tokens_seen": 27842336, + "step": 30775 + }, + { + "epoch": 8.12353174079451, + "grad_norm": 0.0060622598975896835, + "learning_rate": 0.037647101355505065, + "loss": 0.059, + "num_input_tokens_seen": 27846976, + "step": 30780 + }, + { + "epoch": 8.124851524350007, + "grad_norm": 0.003517898265272379, + "learning_rate": 0.03760808271207581, + "loss": 0.0328, + "num_input_tokens_seen": 27851744, + "step": 30785 + }, + { + "epoch": 8.126171307905503, + "grad_norm": 0.000581293657887727, + "learning_rate": 0.03756908140089258, + "loss": 0.0184, + "num_input_tokens_seen": 27855968, + "step": 30790 + }, + { + "epoch": 8.127491091461, + "grad_norm": 0.0031419554725289345, + "learning_rate": 0.03753009742796989, + "loss": 0.0303, + "num_input_tokens_seen": 27860160, + "step": 30795 + }, + { + "epoch": 8.128810875016498, + "grad_norm": 0.0019322952721267939, + "learning_rate": 0.037491130799319615, + "loss": 0.0093, + "num_input_tokens_seen": 27864736, + "step": 30800 + }, + { + "epoch": 8.128810875016498, + "eval_loss": 0.1000254675745964, + "eval_runtime": 75.8528, + "eval_samples_per_second": 88.79, + "eval_steps_per_second": 22.201, + "num_input_tokens_seen": 27864736, + "step": 30800 + }, + { + "epoch": 8.130130658571995, + "grad_norm": 0.002598476829007268, + "learning_rate": 0.03745218152095079, + "loss": 0.0155, + "num_input_tokens_seen": 27869024, + "step": 30805 + }, + { + "epoch": 8.131450442127491, + "grad_norm": 0.0003596097812987864, + "learning_rate": 0.037413249598869935, + "loss": 0.0169, + "num_input_tokens_seen": 27873248, + "step": 30810 + }, + { + "epoch": 8.132770225682988, + "grad_norm": 0.0005366475088521838, + "learning_rate": 0.037374335039080886, + "loss": 0.0229, + "num_input_tokens_seen": 27878048, + "step": 30815 + }, + { + "epoch": 8.134090009238484, + "grad_norm": 0.0008490142645314336, + "learning_rate": 0.037335437847584724, + "loss": 0.0215, + "num_input_tokens_seen": 27882528, + "step": 30820 + }, + { + "epoch": 8.135409792793983, + "grad_norm": 0.0013036688324064016, + "learning_rate": 0.03729655803037983, + "loss": 0.0286, + "num_input_tokens_seen": 27887008, + "step": 30825 + }, + { + "epoch": 8.13672957634948, + "grad_norm": 0.0019860772881656885, + "learning_rate": 0.03725769559346207, + "loss": 0.0268, + "num_input_tokens_seen": 27891296, + "step": 30830 + }, + { + "epoch": 8.138049359904976, + "grad_norm": 9.619764750823379e-05, + "learning_rate": 0.03721885054282439, + "loss": 0.0157, + "num_input_tokens_seen": 27896000, + "step": 30835 + }, + { + "epoch": 8.139369143460472, + "grad_norm": 0.00011845912376884371, + "learning_rate": 0.03718002288445731, + "loss": 0.0176, + "num_input_tokens_seen": 27900352, + "step": 30840 + }, + { + "epoch": 8.140688927015969, + "grad_norm": 0.0006338792736642063, + "learning_rate": 0.03714121262434844, + "loss": 0.0338, + "num_input_tokens_seen": 27904832, + "step": 30845 + }, + { + "epoch": 8.142008710571467, + "grad_norm": 0.0007098709465935826, + "learning_rate": 0.037102419768482844, + "loss": 0.0117, + "num_input_tokens_seen": 27909312, + "step": 30850 + }, + { + "epoch": 8.143328494126964, + "grad_norm": 0.00034325505839660764, + "learning_rate": 0.03706364432284293, + "loss": 0.0296, + "num_input_tokens_seen": 27914112, + "step": 30855 + }, + { + "epoch": 8.14464827768246, + "grad_norm": 0.004871363751590252, + "learning_rate": 0.03702488629340828, + "loss": 0.0151, + "num_input_tokens_seen": 27918688, + "step": 30860 + }, + { + "epoch": 8.145968061237957, + "grad_norm": 0.0005323388031683862, + "learning_rate": 0.036986145686155915, + "loss": 0.0235, + "num_input_tokens_seen": 27923168, + "step": 30865 + }, + { + "epoch": 8.147287844793453, + "grad_norm": 0.005188556853681803, + "learning_rate": 0.036947422507060075, + "loss": 0.0274, + "num_input_tokens_seen": 27927584, + "step": 30870 + }, + { + "epoch": 8.14860762834895, + "grad_norm": 0.0011156396940350533, + "learning_rate": 0.0369087167620924, + "loss": 0.0128, + "num_input_tokens_seen": 27931968, + "step": 30875 + }, + { + "epoch": 8.149927411904448, + "grad_norm": 0.00540150748565793, + "learning_rate": 0.03687002845722183, + "loss": 0.0418, + "num_input_tokens_seen": 27936288, + "step": 30880 + }, + { + "epoch": 8.151247195459945, + "grad_norm": 0.0013283853186294436, + "learning_rate": 0.03683135759841451, + "loss": 0.0085, + "num_input_tokens_seen": 27940864, + "step": 30885 + }, + { + "epoch": 8.152566979015441, + "grad_norm": 0.0009421256254427135, + "learning_rate": 0.03679270419163406, + "loss": 0.0083, + "num_input_tokens_seen": 27945600, + "step": 30890 + }, + { + "epoch": 8.153886762570938, + "grad_norm": 0.0009931927779689431, + "learning_rate": 0.03675406824284127, + "loss": 0.0263, + "num_input_tokens_seen": 27949984, + "step": 30895 + }, + { + "epoch": 8.155206546126434, + "grad_norm": 0.0054174382239580154, + "learning_rate": 0.03671544975799425, + "loss": 0.067, + "num_input_tokens_seen": 27954304, + "step": 30900 + }, + { + "epoch": 8.156526329681933, + "grad_norm": 0.00010856413427973166, + "learning_rate": 0.03667684874304854, + "loss": 0.0246, + "num_input_tokens_seen": 27958848, + "step": 30905 + }, + { + "epoch": 8.15784611323743, + "grad_norm": 0.00150249432772398, + "learning_rate": 0.03663826520395683, + "loss": 0.021, + "num_input_tokens_seen": 27963296, + "step": 30910 + }, + { + "epoch": 8.159165896792926, + "grad_norm": 0.0013572816969826818, + "learning_rate": 0.03659969914666922, + "loss": 0.0131, + "num_input_tokens_seen": 27967904, + "step": 30915 + }, + { + "epoch": 8.160485680348422, + "grad_norm": 0.000382676487788558, + "learning_rate": 0.036561150577133106, + "loss": 0.0086, + "num_input_tokens_seen": 27972160, + "step": 30920 + }, + { + "epoch": 8.161805463903919, + "grad_norm": 0.0013926639221608639, + "learning_rate": 0.036522619501293103, + "loss": 0.0199, + "num_input_tokens_seen": 27976544, + "step": 30925 + }, + { + "epoch": 8.163125247459417, + "grad_norm": 0.002695071045309305, + "learning_rate": 0.03648410592509122, + "loss": 0.0337, + "num_input_tokens_seen": 27981152, + "step": 30930 + }, + { + "epoch": 8.164445031014914, + "grad_norm": 0.001490918337367475, + "learning_rate": 0.03644560985446676, + "loss": 0.0377, + "num_input_tokens_seen": 27985856, + "step": 30935 + }, + { + "epoch": 8.16576481457041, + "grad_norm": 0.0003960692847613245, + "learning_rate": 0.036407131295356256, + "loss": 0.0198, + "num_input_tokens_seen": 27990240, + "step": 30940 + }, + { + "epoch": 8.167084598125907, + "grad_norm": 0.0006873577367514372, + "learning_rate": 0.03636867025369362, + "loss": 0.0275, + "num_input_tokens_seen": 27994624, + "step": 30945 + }, + { + "epoch": 8.168404381681404, + "grad_norm": 0.0011040173703804612, + "learning_rate": 0.03633022673540999, + "loss": 0.0296, + "num_input_tokens_seen": 27999168, + "step": 30950 + }, + { + "epoch": 8.169724165236902, + "grad_norm": 0.004480937495827675, + "learning_rate": 0.03629180074643385, + "loss": 0.038, + "num_input_tokens_seen": 28003872, + "step": 30955 + }, + { + "epoch": 8.171043948792398, + "grad_norm": 0.0009747516014613211, + "learning_rate": 0.03625339229269102, + "loss": 0.0327, + "num_input_tokens_seen": 28008416, + "step": 30960 + }, + { + "epoch": 8.172363732347895, + "grad_norm": 0.0015130892861634493, + "learning_rate": 0.036215001380104535, + "loss": 0.0129, + "num_input_tokens_seen": 28012992, + "step": 30965 + }, + { + "epoch": 8.173683515903392, + "grad_norm": 0.00048192497342824936, + "learning_rate": 0.03617662801459471, + "loss": 0.0051, + "num_input_tokens_seen": 28017344, + "step": 30970 + }, + { + "epoch": 8.175003299458888, + "grad_norm": 0.0028554147575050592, + "learning_rate": 0.036138272202079276, + "loss": 0.0325, + "num_input_tokens_seen": 28021824, + "step": 30975 + }, + { + "epoch": 8.176323083014386, + "grad_norm": 0.002619448583573103, + "learning_rate": 0.036099933948473106, + "loss": 0.0073, + "num_input_tokens_seen": 28026176, + "step": 30980 + }, + { + "epoch": 8.177642866569883, + "grad_norm": 0.0030537815764546394, + "learning_rate": 0.03606161325968851, + "loss": 0.0313, + "num_input_tokens_seen": 28030752, + "step": 30985 + }, + { + "epoch": 8.17896265012538, + "grad_norm": 0.003196166129782796, + "learning_rate": 0.03602331014163496, + "loss": 0.0258, + "num_input_tokens_seen": 28035488, + "step": 30990 + }, + { + "epoch": 8.180282433680876, + "grad_norm": 0.002588817849755287, + "learning_rate": 0.035985024600219295, + "loss": 0.0239, + "num_input_tokens_seen": 28040160, + "step": 30995 + }, + { + "epoch": 8.181602217236373, + "grad_norm": 0.0038030839059501886, + "learning_rate": 0.03594675664134569, + "loss": 0.0177, + "num_input_tokens_seen": 28044640, + "step": 31000 + }, + { + "epoch": 8.181602217236373, + "eval_loss": 0.10283501446247101, + "eval_runtime": 75.8547, + "eval_samples_per_second": 88.788, + "eval_steps_per_second": 22.2, + "num_input_tokens_seen": 28044640, + "step": 31000 + }, + { + "epoch": 8.18292200079187, + "grad_norm": 0.004272895399481058, + "learning_rate": 0.03590850627091545, + "loss": 0.0301, + "num_input_tokens_seen": 28049216, + "step": 31005 + }, + { + "epoch": 8.184241784347368, + "grad_norm": 0.0008668527589179575, + "learning_rate": 0.03587027349482731, + "loss": 0.0364, + "num_input_tokens_seen": 28053728, + "step": 31010 + }, + { + "epoch": 8.185561567902864, + "grad_norm": 0.0018996060825884342, + "learning_rate": 0.035832058318977275, + "loss": 0.0321, + "num_input_tokens_seen": 28058304, + "step": 31015 + }, + { + "epoch": 8.18688135145836, + "grad_norm": 0.005200002808123827, + "learning_rate": 0.03579386074925853, + "loss": 0.0311, + "num_input_tokens_seen": 28063040, + "step": 31020 + }, + { + "epoch": 8.188201135013857, + "grad_norm": 0.0003875707625411451, + "learning_rate": 0.035755680791561696, + "loss": 0.0147, + "num_input_tokens_seen": 28067808, + "step": 31025 + }, + { + "epoch": 8.189520918569354, + "grad_norm": 0.002878437750041485, + "learning_rate": 0.03571751845177454, + "loss": 0.0171, + "num_input_tokens_seen": 28072160, + "step": 31030 + }, + { + "epoch": 8.190840702124852, + "grad_norm": 0.001320317736826837, + "learning_rate": 0.03567937373578225, + "loss": 0.0163, + "num_input_tokens_seen": 28076928, + "step": 31035 + }, + { + "epoch": 8.192160485680349, + "grad_norm": 0.0033071928191930056, + "learning_rate": 0.03564124664946711, + "loss": 0.0357, + "num_input_tokens_seen": 28081504, + "step": 31040 + }, + { + "epoch": 8.193480269235845, + "grad_norm": 0.0004957421915605664, + "learning_rate": 0.035603137198708924, + "loss": 0.0184, + "num_input_tokens_seen": 28085984, + "step": 31045 + }, + { + "epoch": 8.194800052791342, + "grad_norm": 0.0026002642698585987, + "learning_rate": 0.035565045389384514, + "loss": 0.0161, + "num_input_tokens_seen": 28090592, + "step": 31050 + }, + { + "epoch": 8.196119836346838, + "grad_norm": 0.00023825824609957635, + "learning_rate": 0.03552697122736823, + "loss": 0.009, + "num_input_tokens_seen": 28095264, + "step": 31055 + }, + { + "epoch": 8.197439619902337, + "grad_norm": 0.0028661591932177544, + "learning_rate": 0.03548891471853153, + "loss": 0.0209, + "num_input_tokens_seen": 28099680, + "step": 31060 + }, + { + "epoch": 8.198759403457833, + "grad_norm": 0.0002684770443011075, + "learning_rate": 0.03545087586874322, + "loss": 0.0065, + "num_input_tokens_seen": 28104320, + "step": 31065 + }, + { + "epoch": 8.20007918701333, + "grad_norm": 0.0012442077277228236, + "learning_rate": 0.03541285468386935, + "loss": 0.036, + "num_input_tokens_seen": 28108576, + "step": 31070 + }, + { + "epoch": 8.201398970568826, + "grad_norm": 0.0007986663258634508, + "learning_rate": 0.03537485116977327, + "loss": 0.0146, + "num_input_tokens_seen": 28113216, + "step": 31075 + }, + { + "epoch": 8.202718754124323, + "grad_norm": 0.0028199690859764814, + "learning_rate": 0.03533686533231565, + "loss": 0.0208, + "num_input_tokens_seen": 28117696, + "step": 31080 + }, + { + "epoch": 8.204038537679821, + "grad_norm": 0.002753922250121832, + "learning_rate": 0.0352988971773543, + "loss": 0.0274, + "num_input_tokens_seen": 28121952, + "step": 31085 + }, + { + "epoch": 8.205358321235318, + "grad_norm": 0.005267021246254444, + "learning_rate": 0.03526094671074443, + "loss": 0.0267, + "num_input_tokens_seen": 28126752, + "step": 31090 + }, + { + "epoch": 8.206678104790814, + "grad_norm": 0.008477134630084038, + "learning_rate": 0.03522301393833852, + "loss": 0.0176, + "num_input_tokens_seen": 28131040, + "step": 31095 + }, + { + "epoch": 8.207997888346311, + "grad_norm": 0.004156159702688456, + "learning_rate": 0.035185098865986204, + "loss": 0.0547, + "num_input_tokens_seen": 28135360, + "step": 31100 + }, + { + "epoch": 8.209317671901808, + "grad_norm": 0.0014771893620491028, + "learning_rate": 0.03514720149953453, + "loss": 0.0219, + "num_input_tokens_seen": 28139680, + "step": 31105 + }, + { + "epoch": 8.210637455457306, + "grad_norm": 0.0006373871001414955, + "learning_rate": 0.03510932184482773, + "loss": 0.0481, + "num_input_tokens_seen": 28144512, + "step": 31110 + }, + { + "epoch": 8.211957239012802, + "grad_norm": 0.0001297996350331232, + "learning_rate": 0.03507145990770724, + "loss": 0.019, + "num_input_tokens_seen": 28149024, + "step": 31115 + }, + { + "epoch": 8.213277022568299, + "grad_norm": 0.0008743223734200001, + "learning_rate": 0.035033615694011984, + "loss": 0.0215, + "num_input_tokens_seen": 28153728, + "step": 31120 + }, + { + "epoch": 8.214596806123796, + "grad_norm": 0.003237613011151552, + "learning_rate": 0.03499578920957788, + "loss": 0.0229, + "num_input_tokens_seen": 28158432, + "step": 31125 + }, + { + "epoch": 8.215916589679292, + "grad_norm": 0.0018316922942176461, + "learning_rate": 0.034957980460238375, + "loss": 0.0302, + "num_input_tokens_seen": 28162944, + "step": 31130 + }, + { + "epoch": 8.21723637323479, + "grad_norm": 0.0010320426663383842, + "learning_rate": 0.03492018945182393, + "loss": 0.0105, + "num_input_tokens_seen": 28167488, + "step": 31135 + }, + { + "epoch": 8.218556156790287, + "grad_norm": 0.0007440464105457067, + "learning_rate": 0.03488241619016247, + "loss": 0.0154, + "num_input_tokens_seen": 28172000, + "step": 31140 + }, + { + "epoch": 8.219875940345784, + "grad_norm": 0.002290052128955722, + "learning_rate": 0.03484466068107913, + "loss": 0.0121, + "num_input_tokens_seen": 28176448, + "step": 31145 + }, + { + "epoch": 8.22119572390128, + "grad_norm": 0.0018126574577763677, + "learning_rate": 0.034806922930396195, + "loss": 0.0112, + "num_input_tokens_seen": 28180736, + "step": 31150 + }, + { + "epoch": 8.222515507456777, + "grad_norm": 0.0025818480644375086, + "learning_rate": 0.03476920294393337, + "loss": 0.0415, + "num_input_tokens_seen": 28185184, + "step": 31155 + }, + { + "epoch": 8.223835291012273, + "grad_norm": 0.0038529369048774242, + "learning_rate": 0.03473150072750755, + "loss": 0.0282, + "num_input_tokens_seen": 28189856, + "step": 31160 + }, + { + "epoch": 8.225155074567772, + "grad_norm": 0.0003263155522290617, + "learning_rate": 0.03469381628693284, + "loss": 0.02, + "num_input_tokens_seen": 28194272, + "step": 31165 + }, + { + "epoch": 8.226474858123268, + "grad_norm": 0.0015331122558563948, + "learning_rate": 0.03465614962802072, + "loss": 0.0141, + "num_input_tokens_seen": 28198816, + "step": 31170 + }, + { + "epoch": 8.227794641678765, + "grad_norm": 0.001365587580949068, + "learning_rate": 0.0346185007565798, + "loss": 0.0289, + "num_input_tokens_seen": 28203680, + "step": 31175 + }, + { + "epoch": 8.229114425234261, + "grad_norm": 0.0006273715407587588, + "learning_rate": 0.03458086967841609, + "loss": 0.0034, + "num_input_tokens_seen": 28208448, + "step": 31180 + }, + { + "epoch": 8.230434208789758, + "grad_norm": 0.00036080897552892566, + "learning_rate": 0.03454325639933266, + "loss": 0.0125, + "num_input_tokens_seen": 28213152, + "step": 31185 + }, + { + "epoch": 8.231753992345256, + "grad_norm": 0.002122787991538644, + "learning_rate": 0.03450566092513007, + "loss": 0.0275, + "num_input_tokens_seen": 28217440, + "step": 31190 + }, + { + "epoch": 8.233073775900753, + "grad_norm": 0.0023745023645460606, + "learning_rate": 0.034468083261605914, + "loss": 0.0163, + "num_input_tokens_seen": 28221664, + "step": 31195 + }, + { + "epoch": 8.23439355945625, + "grad_norm": 0.002256014384329319, + "learning_rate": 0.03443052341455522, + "loss": 0.0156, + "num_input_tokens_seen": 28226016, + "step": 31200 + }, + { + "epoch": 8.23439355945625, + "eval_loss": 0.10451336950063705, + "eval_runtime": 75.8759, + "eval_samples_per_second": 88.763, + "eval_steps_per_second": 22.194, + "num_input_tokens_seen": 28226016, + "step": 31200 + }, + { + "epoch": 8.235713343011746, + "grad_norm": 0.0030942270532250404, + "learning_rate": 0.0343929813897701, + "loss": 0.0264, + "num_input_tokens_seen": 28230400, + "step": 31205 + }, + { + "epoch": 8.237033126567242, + "grad_norm": 0.0013886167434975505, + "learning_rate": 0.034355457193040125, + "loss": 0.0134, + "num_input_tokens_seen": 28234848, + "step": 31210 + }, + { + "epoch": 8.23835291012274, + "grad_norm": 0.0012616689782589674, + "learning_rate": 0.03431795083015186, + "loss": 0.0174, + "num_input_tokens_seen": 28239136, + "step": 31215 + }, + { + "epoch": 8.239672693678237, + "grad_norm": 0.0009161115158349276, + "learning_rate": 0.03428046230688936, + "loss": 0.0125, + "num_input_tokens_seen": 28243616, + "step": 31220 + }, + { + "epoch": 8.240992477233734, + "grad_norm": 0.0021253551822155714, + "learning_rate": 0.034242991629033805, + "loss": 0.0232, + "num_input_tokens_seen": 28247808, + "step": 31225 + }, + { + "epoch": 8.24231226078923, + "grad_norm": 0.00031800891156308353, + "learning_rate": 0.03420553880236362, + "loss": 0.0199, + "num_input_tokens_seen": 28252032, + "step": 31230 + }, + { + "epoch": 8.243632044344727, + "grad_norm": 0.00013753960956819355, + "learning_rate": 0.03416810383265449, + "loss": 0.0338, + "num_input_tokens_seen": 28256640, + "step": 31235 + }, + { + "epoch": 8.244951827900225, + "grad_norm": 0.0032142726704478264, + "learning_rate": 0.03413068672567944, + "loss": 0.0162, + "num_input_tokens_seen": 28261248, + "step": 31240 + }, + { + "epoch": 8.246271611455722, + "grad_norm": 0.0021431748755276203, + "learning_rate": 0.034093287487208565, + "loss": 0.0119, + "num_input_tokens_seen": 28265632, + "step": 31245 + }, + { + "epoch": 8.247591395011218, + "grad_norm": 0.0005092740175314248, + "learning_rate": 0.03405590612300937, + "loss": 0.0098, + "num_input_tokens_seen": 28270016, + "step": 31250 + }, + { + "epoch": 8.248911178566715, + "grad_norm": 0.001963397953659296, + "learning_rate": 0.03401854263884646, + "loss": 0.0246, + "num_input_tokens_seen": 28274752, + "step": 31255 + }, + { + "epoch": 8.250230962122211, + "grad_norm": 0.004177106078714132, + "learning_rate": 0.033981197040481824, + "loss": 0.0265, + "num_input_tokens_seen": 28279328, + "step": 31260 + }, + { + "epoch": 8.251550745677708, + "grad_norm": 0.005557356867939234, + "learning_rate": 0.03394386933367459, + "loss": 0.0221, + "num_input_tokens_seen": 28284000, + "step": 31265 + }, + { + "epoch": 8.252870529233206, + "grad_norm": 0.00043020304292440414, + "learning_rate": 0.033906559524181104, + "loss": 0.0138, + "num_input_tokens_seen": 28288352, + "step": 31270 + }, + { + "epoch": 8.254190312788703, + "grad_norm": 0.0004255968378856778, + "learning_rate": 0.033869267617755085, + "loss": 0.0113, + "num_input_tokens_seen": 28292992, + "step": 31275 + }, + { + "epoch": 8.2555100963442, + "grad_norm": 0.0030969104263931513, + "learning_rate": 0.0338319936201474, + "loss": 0.0181, + "num_input_tokens_seen": 28297568, + "step": 31280 + }, + { + "epoch": 8.256829879899696, + "grad_norm": 0.001321200979873538, + "learning_rate": 0.033794737537106136, + "loss": 0.0107, + "num_input_tokens_seen": 28302176, + "step": 31285 + }, + { + "epoch": 8.258149663455193, + "grad_norm": 0.0028185138944536448, + "learning_rate": 0.03375749937437671, + "loss": 0.0249, + "num_input_tokens_seen": 28306720, + "step": 31290 + }, + { + "epoch": 8.25946944701069, + "grad_norm": 0.0003754365898203105, + "learning_rate": 0.033720279137701634, + "loss": 0.0217, + "num_input_tokens_seen": 28311296, + "step": 31295 + }, + { + "epoch": 8.260789230566187, + "grad_norm": 0.003036327427253127, + "learning_rate": 0.03368307683282078, + "loss": 0.0133, + "num_input_tokens_seen": 28315744, + "step": 31300 + }, + { + "epoch": 8.262109014121684, + "grad_norm": 0.0021348430309444666, + "learning_rate": 0.033645892465471235, + "loss": 0.0295, + "num_input_tokens_seen": 28320160, + "step": 31305 + }, + { + "epoch": 8.26342879767718, + "grad_norm": 0.002979624317958951, + "learning_rate": 0.03360872604138724, + "loss": 0.017, + "num_input_tokens_seen": 28324576, + "step": 31310 + }, + { + "epoch": 8.264748581232677, + "grad_norm": 0.004622132517397404, + "learning_rate": 0.03357157756630034, + "loss": 0.0298, + "num_input_tokens_seen": 28329152, + "step": 31315 + }, + { + "epoch": 8.266068364788175, + "grad_norm": 0.0004310408257879317, + "learning_rate": 0.033534447045939365, + "loss": 0.0289, + "num_input_tokens_seen": 28333568, + "step": 31320 + }, + { + "epoch": 8.267388148343672, + "grad_norm": 0.001722470624372363, + "learning_rate": 0.03349733448603026, + "loss": 0.0344, + "num_input_tokens_seen": 28338112, + "step": 31325 + }, + { + "epoch": 8.268707931899169, + "grad_norm": 0.0019457296002656221, + "learning_rate": 0.03346023989229619, + "loss": 0.0224, + "num_input_tokens_seen": 28342560, + "step": 31330 + }, + { + "epoch": 8.270027715454665, + "grad_norm": 0.0033967597410082817, + "learning_rate": 0.03342316327045769, + "loss": 0.041, + "num_input_tokens_seen": 28347232, + "step": 31335 + }, + { + "epoch": 8.271347499010162, + "grad_norm": 0.00286977575160563, + "learning_rate": 0.033386104626232385, + "loss": 0.0371, + "num_input_tokens_seen": 28351552, + "step": 31340 + }, + { + "epoch": 8.27266728256566, + "grad_norm": 0.00025229857419617474, + "learning_rate": 0.03334906396533525, + "loss": 0.0556, + "num_input_tokens_seen": 28355936, + "step": 31345 + }, + { + "epoch": 8.273987066121157, + "grad_norm": 0.0026021527592092752, + "learning_rate": 0.033312041293478326, + "loss": 0.0281, + "num_input_tokens_seen": 28360608, + "step": 31350 + }, + { + "epoch": 8.275306849676653, + "grad_norm": 0.003551360685378313, + "learning_rate": 0.03327503661637103, + "loss": 0.0481, + "num_input_tokens_seen": 28365152, + "step": 31355 + }, + { + "epoch": 8.27662663323215, + "grad_norm": 0.00028498118626885116, + "learning_rate": 0.03323804993971998, + "loss": 0.0314, + "num_input_tokens_seen": 28369984, + "step": 31360 + }, + { + "epoch": 8.277946416787646, + "grad_norm": 0.0007018196629360318, + "learning_rate": 0.033201081269228924, + "loss": 0.0293, + "num_input_tokens_seen": 28374336, + "step": 31365 + }, + { + "epoch": 8.279266200343145, + "grad_norm": 0.0020542084239423275, + "learning_rate": 0.03316413061059895, + "loss": 0.0486, + "num_input_tokens_seen": 28378752, + "step": 31370 + }, + { + "epoch": 8.280585983898641, + "grad_norm": 0.0005139259155839682, + "learning_rate": 0.03312719796952827, + "loss": 0.0185, + "num_input_tokens_seen": 28383424, + "step": 31375 + }, + { + "epoch": 8.281905767454138, + "grad_norm": 0.0017096401425078511, + "learning_rate": 0.03309028335171236, + "loss": 0.0322, + "num_input_tokens_seen": 28387840, + "step": 31380 + }, + { + "epoch": 8.283225551009634, + "grad_norm": 0.002571436343714595, + "learning_rate": 0.03305338676284398, + "loss": 0.0216, + "num_input_tokens_seen": 28392384, + "step": 31385 + }, + { + "epoch": 8.28454533456513, + "grad_norm": 0.00020912739273626357, + "learning_rate": 0.03301650820861296, + "loss": 0.0117, + "num_input_tokens_seen": 28397024, + "step": 31390 + }, + { + "epoch": 8.285865118120629, + "grad_norm": 0.0008358114282600582, + "learning_rate": 0.03297964769470652, + "loss": 0.0272, + "num_input_tokens_seen": 28401856, + "step": 31395 + }, + { + "epoch": 8.287184901676126, + "grad_norm": 0.0015819764230400324, + "learning_rate": 0.032942805226808945, + "loss": 0.0373, + "num_input_tokens_seen": 28406464, + "step": 31400 + }, + { + "epoch": 8.287184901676126, + "eval_loss": 0.10240993648767471, + "eval_runtime": 75.95, + "eval_samples_per_second": 88.677, + "eval_steps_per_second": 22.172, + "num_input_tokens_seen": 28406464, + "step": 31400 + }, + { + "epoch": 8.288504685231622, + "grad_norm": 0.0025610169395804405, + "learning_rate": 0.03290598081060187, + "loss": 0.0131, + "num_input_tokens_seen": 28411200, + "step": 31405 + }, + { + "epoch": 8.289824468787119, + "grad_norm": 0.0004199015093035996, + "learning_rate": 0.03286917445176407, + "loss": 0.0076, + "num_input_tokens_seen": 28416192, + "step": 31410 + }, + { + "epoch": 8.291144252342615, + "grad_norm": 0.0009033746900968254, + "learning_rate": 0.032832386155971456, + "loss": 0.02, + "num_input_tokens_seen": 28420800, + "step": 31415 + }, + { + "epoch": 8.292464035898112, + "grad_norm": 0.0012214331654831767, + "learning_rate": 0.032795615928897334, + "loss": 0.01, + "num_input_tokens_seen": 28425184, + "step": 31420 + }, + { + "epoch": 8.29378381945361, + "grad_norm": 0.0005339792696759105, + "learning_rate": 0.03275886377621215, + "loss": 0.028, + "num_input_tokens_seen": 28429760, + "step": 31425 + }, + { + "epoch": 8.295103603009107, + "grad_norm": 0.0006726306164637208, + "learning_rate": 0.03272212970358348, + "loss": 0.0154, + "num_input_tokens_seen": 28434464, + "step": 31430 + }, + { + "epoch": 8.296423386564603, + "grad_norm": 0.0006414023227989674, + "learning_rate": 0.032685413716676215, + "loss": 0.0112, + "num_input_tokens_seen": 28439008, + "step": 31435 + }, + { + "epoch": 8.2977431701201, + "grad_norm": 0.0015634900191798806, + "learning_rate": 0.032648715821152474, + "loss": 0.0328, + "num_input_tokens_seen": 28443296, + "step": 31440 + }, + { + "epoch": 8.299062953675596, + "grad_norm": 0.0011444634292274714, + "learning_rate": 0.03261203602267143, + "loss": 0.0143, + "num_input_tokens_seen": 28448032, + "step": 31445 + }, + { + "epoch": 8.300382737231095, + "grad_norm": 0.0028404637705534697, + "learning_rate": 0.03257537432688966, + "loss": 0.0307, + "num_input_tokens_seen": 28452640, + "step": 31450 + }, + { + "epoch": 8.301702520786591, + "grad_norm": 0.0015350126195698977, + "learning_rate": 0.03253873073946077, + "loss": 0.0477, + "num_input_tokens_seen": 28456960, + "step": 31455 + }, + { + "epoch": 8.303022304342088, + "grad_norm": 0.0005121941794641316, + "learning_rate": 0.03250210526603572, + "loss": 0.0252, + "num_input_tokens_seen": 28461440, + "step": 31460 + }, + { + "epoch": 8.304342087897584, + "grad_norm": 0.0029087329749017954, + "learning_rate": 0.03246549791226266, + "loss": 0.0493, + "num_input_tokens_seen": 28466176, + "step": 31465 + }, + { + "epoch": 8.305661871453081, + "grad_norm": 0.002002989873290062, + "learning_rate": 0.03242890868378679, + "loss": 0.0235, + "num_input_tokens_seen": 28470528, + "step": 31470 + }, + { + "epoch": 8.30698165500858, + "grad_norm": 0.002383690793067217, + "learning_rate": 0.03239233758625074, + "loss": 0.0243, + "num_input_tokens_seen": 28474848, + "step": 31475 + }, + { + "epoch": 8.308301438564076, + "grad_norm": 0.0001610494509804994, + "learning_rate": 0.032355784625294204, + "loss": 0.013, + "num_input_tokens_seen": 28479424, + "step": 31480 + }, + { + "epoch": 8.309621222119572, + "grad_norm": 0.0013886961387470365, + "learning_rate": 0.03231924980655402, + "loss": 0.0084, + "num_input_tokens_seen": 28483936, + "step": 31485 + }, + { + "epoch": 8.310941005675069, + "grad_norm": 0.0009585957159288228, + "learning_rate": 0.032282733135664446, + "loss": 0.0303, + "num_input_tokens_seen": 28488384, + "step": 31490 + }, + { + "epoch": 8.312260789230566, + "grad_norm": 0.00043026782805100083, + "learning_rate": 0.03224623461825669, + "loss": 0.0191, + "num_input_tokens_seen": 28492832, + "step": 31495 + }, + { + "epoch": 8.313580572786064, + "grad_norm": 0.00042616319842636585, + "learning_rate": 0.03220975425995937, + "loss": 0.0243, + "num_input_tokens_seen": 28497088, + "step": 31500 + }, + { + "epoch": 8.31490035634156, + "grad_norm": 0.0010143907275050879, + "learning_rate": 0.032173292066398206, + "loss": 0.0074, + "num_input_tokens_seen": 28501408, + "step": 31505 + }, + { + "epoch": 8.316220139897057, + "grad_norm": 0.005143147427588701, + "learning_rate": 0.03213684804319606, + "loss": 0.0252, + "num_input_tokens_seen": 28505920, + "step": 31510 + }, + { + "epoch": 8.317539923452554, + "grad_norm": 0.000789240060839802, + "learning_rate": 0.03210042219597312, + "loss": 0.0364, + "num_input_tokens_seen": 28510432, + "step": 31515 + }, + { + "epoch": 8.31885970700805, + "grad_norm": 0.0012697038473561406, + "learning_rate": 0.03206401453034675, + "loss": 0.0119, + "num_input_tokens_seen": 28514944, + "step": 31520 + }, + { + "epoch": 8.320179490563547, + "grad_norm": 0.00410977378487587, + "learning_rate": 0.03202762505193136, + "loss": 0.0173, + "num_input_tokens_seen": 28519648, + "step": 31525 + }, + { + "epoch": 8.321499274119045, + "grad_norm": 0.0018725335830822587, + "learning_rate": 0.031991253766338754, + "loss": 0.0071, + "num_input_tokens_seen": 28524064, + "step": 31530 + }, + { + "epoch": 8.322819057674542, + "grad_norm": 0.003158206818625331, + "learning_rate": 0.03195490067917778, + "loss": 0.0397, + "num_input_tokens_seen": 28528544, + "step": 31535 + }, + { + "epoch": 8.324138841230038, + "grad_norm": 0.00021897576516494155, + "learning_rate": 0.03191856579605461, + "loss": 0.0121, + "num_input_tokens_seen": 28533152, + "step": 31540 + }, + { + "epoch": 8.325458624785535, + "grad_norm": 0.0027591597754508257, + "learning_rate": 0.031882249122572454, + "loss": 0.0316, + "num_input_tokens_seen": 28537824, + "step": 31545 + }, + { + "epoch": 8.326778408341031, + "grad_norm": 0.0041494653560221195, + "learning_rate": 0.03184595066433188, + "loss": 0.0157, + "num_input_tokens_seen": 28541984, + "step": 31550 + }, + { + "epoch": 8.32809819189653, + "grad_norm": 0.0006657605990767479, + "learning_rate": 0.03180967042693049, + "loss": 0.016, + "num_input_tokens_seen": 28546976, + "step": 31555 + }, + { + "epoch": 8.329417975452026, + "grad_norm": 0.0003083054325543344, + "learning_rate": 0.03177340841596323, + "loss": 0.0102, + "num_input_tokens_seen": 28551424, + "step": 31560 + }, + { + "epoch": 8.330737759007523, + "grad_norm": 0.00254820310510695, + "learning_rate": 0.03173716463702209, + "loss": 0.0496, + "num_input_tokens_seen": 28556032, + "step": 31565 + }, + { + "epoch": 8.33205754256302, + "grad_norm": 0.0006841448484919965, + "learning_rate": 0.03170093909569638, + "loss": 0.0249, + "num_input_tokens_seen": 28560352, + "step": 31570 + }, + { + "epoch": 8.333377326118516, + "grad_norm": 0.0033353199250996113, + "learning_rate": 0.03166473179757246, + "loss": 0.031, + "num_input_tokens_seen": 28564416, + "step": 31575 + }, + { + "epoch": 8.334697109674014, + "grad_norm": 0.003236260963603854, + "learning_rate": 0.031628542748234005, + "loss": 0.0482, + "num_input_tokens_seen": 28569056, + "step": 31580 + }, + { + "epoch": 8.33601689322951, + "grad_norm": 0.0012064194306731224, + "learning_rate": 0.03159237195326184, + "loss": 0.0169, + "num_input_tokens_seen": 28573600, + "step": 31585 + }, + { + "epoch": 8.337336676785007, + "grad_norm": 0.002123628742992878, + "learning_rate": 0.031556219418233875, + "loss": 0.038, + "num_input_tokens_seen": 28578272, + "step": 31590 + }, + { + "epoch": 8.338656460340504, + "grad_norm": 0.0030519256833940744, + "learning_rate": 0.03152008514872533, + "loss": 0.0336, + "num_input_tokens_seen": 28582624, + "step": 31595 + }, + { + "epoch": 8.339976243896, + "grad_norm": 0.00101479422301054, + "learning_rate": 0.03148396915030862, + "loss": 0.0261, + "num_input_tokens_seen": 28587264, + "step": 31600 + }, + { + "epoch": 8.339976243896, + "eval_loss": 0.10330627113580704, + "eval_runtime": 75.8038, + "eval_samples_per_second": 88.848, + "eval_steps_per_second": 22.215, + "num_input_tokens_seen": 28587264, + "step": 31600 + }, + { + "epoch": 8.341296027451499, + "grad_norm": 0.0001661147689446807, + "learning_rate": 0.03144787142855318, + "loss": 0.0237, + "num_input_tokens_seen": 28591616, + "step": 31605 + }, + { + "epoch": 8.342615811006995, + "grad_norm": 0.000517295440658927, + "learning_rate": 0.031411791989025835, + "loss": 0.0201, + "num_input_tokens_seen": 28596192, + "step": 31610 + }, + { + "epoch": 8.343935594562492, + "grad_norm": 0.0064855460077524185, + "learning_rate": 0.031375730837290394, + "loss": 0.0243, + "num_input_tokens_seen": 28600800, + "step": 31615 + }, + { + "epoch": 8.345255378117988, + "grad_norm": 0.0029399788472801447, + "learning_rate": 0.031339687978908015, + "loss": 0.0156, + "num_input_tokens_seen": 28605376, + "step": 31620 + }, + { + "epoch": 8.346575161673485, + "grad_norm": 0.00048632436664775014, + "learning_rate": 0.03130366341943694, + "loss": 0.0156, + "num_input_tokens_seen": 28609856, + "step": 31625 + }, + { + "epoch": 8.347894945228983, + "grad_norm": 0.0042504617013037205, + "learning_rate": 0.031267657164432555, + "loss": 0.0235, + "num_input_tokens_seen": 28614400, + "step": 31630 + }, + { + "epoch": 8.34921472878448, + "grad_norm": 0.0010617101797834039, + "learning_rate": 0.03123166921944752, + "loss": 0.008, + "num_input_tokens_seen": 28618816, + "step": 31635 + }, + { + "epoch": 8.350534512339976, + "grad_norm": 0.0023289916571229696, + "learning_rate": 0.031195699590031666, + "loss": 0.038, + "num_input_tokens_seen": 28623040, + "step": 31640 + }, + { + "epoch": 8.351854295895473, + "grad_norm": 0.002153372159227729, + "learning_rate": 0.031159748281731885, + "loss": 0.0107, + "num_input_tokens_seen": 28627392, + "step": 31645 + }, + { + "epoch": 8.35317407945097, + "grad_norm": 0.002925469074398279, + "learning_rate": 0.031123815300092394, + "loss": 0.023, + "num_input_tokens_seen": 28632128, + "step": 31650 + }, + { + "epoch": 8.354493863006468, + "grad_norm": 0.004415068309754133, + "learning_rate": 0.031087900650654424, + "loss": 0.0378, + "num_input_tokens_seen": 28636832, + "step": 31655 + }, + { + "epoch": 8.355813646561964, + "grad_norm": 0.0009347698069177568, + "learning_rate": 0.031052004338956534, + "loss": 0.0112, + "num_input_tokens_seen": 28641280, + "step": 31660 + }, + { + "epoch": 8.357133430117461, + "grad_norm": 0.0004972394672222435, + "learning_rate": 0.031016126370534407, + "loss": 0.0067, + "num_input_tokens_seen": 28645792, + "step": 31665 + }, + { + "epoch": 8.358453213672957, + "grad_norm": 0.0008901755209080875, + "learning_rate": 0.030980266750920804, + "loss": 0.0142, + "num_input_tokens_seen": 28650048, + "step": 31670 + }, + { + "epoch": 8.359772997228454, + "grad_norm": 0.0016061600763350725, + "learning_rate": 0.030944425485645747, + "loss": 0.0139, + "num_input_tokens_seen": 28654496, + "step": 31675 + }, + { + "epoch": 8.36109278078395, + "grad_norm": 0.006048365030437708, + "learning_rate": 0.03090860258023647, + "loss": 0.0523, + "num_input_tokens_seen": 28659040, + "step": 31680 + }, + { + "epoch": 8.362412564339449, + "grad_norm": 0.002956564538180828, + "learning_rate": 0.030872798040217236, + "loss": 0.0441, + "num_input_tokens_seen": 28663392, + "step": 31685 + }, + { + "epoch": 8.363732347894945, + "grad_norm": 0.00047634189832024276, + "learning_rate": 0.03083701187110964, + "loss": 0.0164, + "num_input_tokens_seen": 28667744, + "step": 31690 + }, + { + "epoch": 8.365052131450442, + "grad_norm": 0.0038309916853904724, + "learning_rate": 0.030801244078432294, + "loss": 0.024, + "num_input_tokens_seen": 28672288, + "step": 31695 + }, + { + "epoch": 8.366371915005939, + "grad_norm": 0.00035095351631753147, + "learning_rate": 0.030765494667701024, + "loss": 0.0094, + "num_input_tokens_seen": 28677120, + "step": 31700 + }, + { + "epoch": 8.367691698561435, + "grad_norm": 8.879978850018233e-05, + "learning_rate": 0.030729763644428913, + "loss": 0.0135, + "num_input_tokens_seen": 28681664, + "step": 31705 + }, + { + "epoch": 8.369011482116933, + "grad_norm": 0.003098812885582447, + "learning_rate": 0.030694051014126048, + "loss": 0.0225, + "num_input_tokens_seen": 28686112, + "step": 31710 + }, + { + "epoch": 8.37033126567243, + "grad_norm": 0.0030075281392782927, + "learning_rate": 0.030658356782299792, + "loss": 0.0218, + "num_input_tokens_seen": 28690560, + "step": 31715 + }, + { + "epoch": 8.371651049227927, + "grad_norm": 0.002940332517027855, + "learning_rate": 0.030622680954454726, + "loss": 0.0297, + "num_input_tokens_seen": 28695328, + "step": 31720 + }, + { + "epoch": 8.372970832783423, + "grad_norm": 0.0007975994958542287, + "learning_rate": 0.030587023536092398, + "loss": 0.0128, + "num_input_tokens_seen": 28699936, + "step": 31725 + }, + { + "epoch": 8.37429061633892, + "grad_norm": 0.0006568796234205365, + "learning_rate": 0.03055138453271171, + "loss": 0.0228, + "num_input_tokens_seen": 28704768, + "step": 31730 + }, + { + "epoch": 8.375610399894418, + "grad_norm": 0.0017965288134291768, + "learning_rate": 0.03051576394980858, + "loss": 0.021, + "num_input_tokens_seen": 28709248, + "step": 31735 + }, + { + "epoch": 8.376930183449915, + "grad_norm": 0.0044205449521541595, + "learning_rate": 0.030480161792876187, + "loss": 0.0205, + "num_input_tokens_seen": 28713984, + "step": 31740 + }, + { + "epoch": 8.378249967005411, + "grad_norm": 0.0010303029557690024, + "learning_rate": 0.030444578067404846, + "loss": 0.0286, + "num_input_tokens_seen": 28718432, + "step": 31745 + }, + { + "epoch": 8.379569750560908, + "grad_norm": 0.006565309129655361, + "learning_rate": 0.030409012778881975, + "loss": 0.0281, + "num_input_tokens_seen": 28722816, + "step": 31750 + }, + { + "epoch": 8.380889534116404, + "grad_norm": 0.007664374075829983, + "learning_rate": 0.030373465932792235, + "loss": 0.0232, + "num_input_tokens_seen": 28727456, + "step": 31755 + }, + { + "epoch": 8.382209317671903, + "grad_norm": 0.0016125543043017387, + "learning_rate": 0.030337937534617342, + "loss": 0.0356, + "num_input_tokens_seen": 28731872, + "step": 31760 + }, + { + "epoch": 8.3835291012274, + "grad_norm": 0.0032451709266752005, + "learning_rate": 0.030302427589836277, + "loss": 0.0255, + "num_input_tokens_seen": 28736352, + "step": 31765 + }, + { + "epoch": 8.384848884782896, + "grad_norm": 0.0002675770374480635, + "learning_rate": 0.030266936103925095, + "loss": 0.0108, + "num_input_tokens_seen": 28740896, + "step": 31770 + }, + { + "epoch": 8.386168668338392, + "grad_norm": 0.006041364744305611, + "learning_rate": 0.030231463082356982, + "loss": 0.0222, + "num_input_tokens_seen": 28745728, + "step": 31775 + }, + { + "epoch": 8.387488451893889, + "grad_norm": 0.00424201088026166, + "learning_rate": 0.030196008530602367, + "loss": 0.0213, + "num_input_tokens_seen": 28750144, + "step": 31780 + }, + { + "epoch": 8.388808235449387, + "grad_norm": 0.00020781910279765725, + "learning_rate": 0.030160572454128842, + "loss": 0.0159, + "num_input_tokens_seen": 28754592, + "step": 31785 + }, + { + "epoch": 8.390128019004884, + "grad_norm": 0.001352038816548884, + "learning_rate": 0.03012515485840098, + "loss": 0.0318, + "num_input_tokens_seen": 28758944, + "step": 31790 + }, + { + "epoch": 8.39144780256038, + "grad_norm": 0.003212234703823924, + "learning_rate": 0.030089755748880734, + "loss": 0.0119, + "num_input_tokens_seen": 28763392, + "step": 31795 + }, + { + "epoch": 8.392767586115877, + "grad_norm": 0.0014771163696423173, + "learning_rate": 0.030054375131027003, + "loss": 0.0439, + "num_input_tokens_seen": 28767840, + "step": 31800 + }, + { + "epoch": 8.392767586115877, + "eval_loss": 0.10670530050992966, + "eval_runtime": 75.889, + "eval_samples_per_second": 88.748, + "eval_steps_per_second": 22.19, + "num_input_tokens_seen": 28767840, + "step": 31800 + }, + { + "epoch": 8.394087369671373, + "grad_norm": 0.0017720364267006516, + "learning_rate": 0.030019013010295942, + "loss": 0.0275, + "num_input_tokens_seen": 28772416, + "step": 31805 + }, + { + "epoch": 8.39540715322687, + "grad_norm": 0.00039253575960174203, + "learning_rate": 0.029983669392140897, + "loss": 0.0046, + "num_input_tokens_seen": 28777056, + "step": 31810 + }, + { + "epoch": 8.396726936782368, + "grad_norm": 0.004092132672667503, + "learning_rate": 0.029948344282012217, + "loss": 0.0288, + "num_input_tokens_seen": 28781600, + "step": 31815 + }, + { + "epoch": 8.398046720337865, + "grad_norm": 0.005926496349275112, + "learning_rate": 0.029913037685357507, + "loss": 0.0389, + "num_input_tokens_seen": 28786528, + "step": 31820 + }, + { + "epoch": 8.399366503893361, + "grad_norm": 0.0031145827379077673, + "learning_rate": 0.029877749607621528, + "loss": 0.042, + "num_input_tokens_seen": 28791136, + "step": 31825 + }, + { + "epoch": 8.400686287448858, + "grad_norm": 0.0019173186738044024, + "learning_rate": 0.029842480054246077, + "loss": 0.0125, + "num_input_tokens_seen": 28795520, + "step": 31830 + }, + { + "epoch": 8.402006071004354, + "grad_norm": 0.0003658223431557417, + "learning_rate": 0.02980722903067022, + "loss": 0.0044, + "num_input_tokens_seen": 28800128, + "step": 31835 + }, + { + "epoch": 8.403325854559853, + "grad_norm": 0.005472165998071432, + "learning_rate": 0.029771996542330113, + "loss": 0.0441, + "num_input_tokens_seen": 28804512, + "step": 31840 + }, + { + "epoch": 8.40464563811535, + "grad_norm": 0.0003373645304236561, + "learning_rate": 0.029736782594658954, + "loss": 0.0085, + "num_input_tokens_seen": 28808992, + "step": 31845 + }, + { + "epoch": 8.405965421670846, + "grad_norm": 0.0003854914684779942, + "learning_rate": 0.029701587193087284, + "loss": 0.049, + "num_input_tokens_seen": 28813440, + "step": 31850 + }, + { + "epoch": 8.407285205226342, + "grad_norm": 0.0013551892479881644, + "learning_rate": 0.0296664103430426, + "loss": 0.0117, + "num_input_tokens_seen": 28818304, + "step": 31855 + }, + { + "epoch": 8.408604988781839, + "grad_norm": 0.0018258062191307545, + "learning_rate": 0.029631252049949652, + "loss": 0.026, + "num_input_tokens_seen": 28822528, + "step": 31860 + }, + { + "epoch": 8.409924772337337, + "grad_norm": 0.00751732662320137, + "learning_rate": 0.02959611231923031, + "loss": 0.0534, + "num_input_tokens_seen": 28827072, + "step": 31865 + }, + { + "epoch": 8.411244555892834, + "grad_norm": 0.0023578759282827377, + "learning_rate": 0.029560991156303507, + "loss": 0.0159, + "num_input_tokens_seen": 28831616, + "step": 31870 + }, + { + "epoch": 8.41256433944833, + "grad_norm": 0.00311439111828804, + "learning_rate": 0.02952588856658544, + "loss": 0.0543, + "num_input_tokens_seen": 28836256, + "step": 31875 + }, + { + "epoch": 8.413884123003827, + "grad_norm": 0.001537169562652707, + "learning_rate": 0.029490804555489296, + "loss": 0.031, + "num_input_tokens_seen": 28840896, + "step": 31880 + }, + { + "epoch": 8.415203906559324, + "grad_norm": 0.0009642604272812605, + "learning_rate": 0.029455739128425484, + "loss": 0.0191, + "num_input_tokens_seen": 28845216, + "step": 31885 + }, + { + "epoch": 8.416523690114822, + "grad_norm": 0.0019743083976209164, + "learning_rate": 0.029420692290801607, + "loss": 0.0213, + "num_input_tokens_seen": 28849664, + "step": 31890 + }, + { + "epoch": 8.417843473670318, + "grad_norm": 0.0009718748042359948, + "learning_rate": 0.02938566404802223, + "loss": 0.0272, + "num_input_tokens_seen": 28854432, + "step": 31895 + }, + { + "epoch": 8.419163257225815, + "grad_norm": 0.0005822524544782937, + "learning_rate": 0.029350654405489195, + "loss": 0.0129, + "num_input_tokens_seen": 28859104, + "step": 31900 + }, + { + "epoch": 8.420483040781312, + "grad_norm": 0.002708901185542345, + "learning_rate": 0.02931566336860145, + "loss": 0.0154, + "num_input_tokens_seen": 28863520, + "step": 31905 + }, + { + "epoch": 8.421802824336808, + "grad_norm": 0.002574399346485734, + "learning_rate": 0.02928069094275505, + "loss": 0.0066, + "num_input_tokens_seen": 28868128, + "step": 31910 + }, + { + "epoch": 8.423122607892306, + "grad_norm": 0.003161190077662468, + "learning_rate": 0.02924573713334314, + "loss": 0.022, + "num_input_tokens_seen": 28872736, + "step": 31915 + }, + { + "epoch": 8.424442391447803, + "grad_norm": 0.0001054096719599329, + "learning_rate": 0.02921080194575603, + "loss": 0.0362, + "num_input_tokens_seen": 28877344, + "step": 31920 + }, + { + "epoch": 8.4257621750033, + "grad_norm": 0.0003531776601448655, + "learning_rate": 0.029175885385381177, + "loss": 0.0279, + "num_input_tokens_seen": 28881984, + "step": 31925 + }, + { + "epoch": 8.427081958558796, + "grad_norm": 0.0020966343581676483, + "learning_rate": 0.029140987457603223, + "loss": 0.025, + "num_input_tokens_seen": 28886144, + "step": 31930 + }, + { + "epoch": 8.428401742114293, + "grad_norm": 0.0020001009106636047, + "learning_rate": 0.029106108167803763, + "loss": 0.0723, + "num_input_tokens_seen": 28890560, + "step": 31935 + }, + { + "epoch": 8.429721525669791, + "grad_norm": 0.002642520470544696, + "learning_rate": 0.029071247521361674, + "loss": 0.0465, + "num_input_tokens_seen": 28895168, + "step": 31940 + }, + { + "epoch": 8.431041309225288, + "grad_norm": 0.003352012950927019, + "learning_rate": 0.029036405523652945, + "loss": 0.0193, + "num_input_tokens_seen": 28899712, + "step": 31945 + }, + { + "epoch": 8.432361092780784, + "grad_norm": 0.003501463681459427, + "learning_rate": 0.029001582180050577, + "loss": 0.016, + "num_input_tokens_seen": 28904096, + "step": 31950 + }, + { + "epoch": 8.43368087633628, + "grad_norm": 0.0037893676199018955, + "learning_rate": 0.02896677749592482, + "loss": 0.0557, + "num_input_tokens_seen": 28908576, + "step": 31955 + }, + { + "epoch": 8.435000659891777, + "grad_norm": 0.002216750057414174, + "learning_rate": 0.028931991476642938, + "loss": 0.0228, + "num_input_tokens_seen": 28912832, + "step": 31960 + }, + { + "epoch": 8.436320443447274, + "grad_norm": 0.0022787200286984444, + "learning_rate": 0.028897224127569412, + "loss": 0.0442, + "num_input_tokens_seen": 28917376, + "step": 31965 + }, + { + "epoch": 8.437640227002772, + "grad_norm": 0.004912621807307005, + "learning_rate": 0.028862475454065832, + "loss": 0.0208, + "num_input_tokens_seen": 28922048, + "step": 31970 + }, + { + "epoch": 8.438960010558269, + "grad_norm": 0.0014445940032601357, + "learning_rate": 0.028827745461490806, + "loss": 0.0192, + "num_input_tokens_seen": 28926240, + "step": 31975 + }, + { + "epoch": 8.440279794113765, + "grad_norm": 0.002984154736623168, + "learning_rate": 0.028793034155200212, + "loss": 0.0278, + "num_input_tokens_seen": 28930720, + "step": 31980 + }, + { + "epoch": 8.441599577669262, + "grad_norm": 0.002897893078625202, + "learning_rate": 0.028758341540546944, + "loss": 0.0556, + "num_input_tokens_seen": 28935360, + "step": 31985 + }, + { + "epoch": 8.442919361224758, + "grad_norm": 0.0012094095582142472, + "learning_rate": 0.02872366762288098, + "loss": 0.0494, + "num_input_tokens_seen": 28939744, + "step": 31990 + }, + { + "epoch": 8.444239144780257, + "grad_norm": 0.0017731826519593596, + "learning_rate": 0.028689012407549567, + "loss": 0.0166, + "num_input_tokens_seen": 28944192, + "step": 31995 + }, + { + "epoch": 8.445558928335753, + "grad_norm": 0.0016990129370242357, + "learning_rate": 0.028654375899896892, + "loss": 0.0105, + "num_input_tokens_seen": 28948576, + "step": 32000 + }, + { + "epoch": 8.445558928335753, + "eval_loss": 0.10139412432909012, + "eval_runtime": 75.7798, + "eval_samples_per_second": 88.876, + "eval_steps_per_second": 22.222, + "num_input_tokens_seen": 28948576, + "step": 32000 + }, + { + "epoch": 8.44687871189125, + "grad_norm": 0.0034328000620007515, + "learning_rate": 0.02861975810526437, + "loss": 0.0176, + "num_input_tokens_seen": 28953088, + "step": 32005 + }, + { + "epoch": 8.448198495446746, + "grad_norm": 0.005156516097486019, + "learning_rate": 0.02858515902899056, + "loss": 0.0166, + "num_input_tokens_seen": 28957664, + "step": 32010 + }, + { + "epoch": 8.449518279002243, + "grad_norm": 0.0022410042583942413, + "learning_rate": 0.028550578676410976, + "loss": 0.0185, + "num_input_tokens_seen": 28961952, + "step": 32015 + }, + { + "epoch": 8.450838062557741, + "grad_norm": 0.005471688695251942, + "learning_rate": 0.02851601705285837, + "loss": 0.0315, + "num_input_tokens_seen": 28966560, + "step": 32020 + }, + { + "epoch": 8.452157846113238, + "grad_norm": 0.0008063002023845911, + "learning_rate": 0.028481474163662666, + "loss": 0.0158, + "num_input_tokens_seen": 28971104, + "step": 32025 + }, + { + "epoch": 8.453477629668734, + "grad_norm": 0.003510295180603862, + "learning_rate": 0.028446950014150683, + "loss": 0.0524, + "num_input_tokens_seen": 28975328, + "step": 32030 + }, + { + "epoch": 8.454797413224231, + "grad_norm": 0.0031282035633921623, + "learning_rate": 0.028412444609646596, + "loss": 0.0176, + "num_input_tokens_seen": 28979840, + "step": 32035 + }, + { + "epoch": 8.456117196779728, + "grad_norm": 0.00017136878159362823, + "learning_rate": 0.028377957955471465, + "loss": 0.0104, + "num_input_tokens_seen": 28984544, + "step": 32040 + }, + { + "epoch": 8.457436980335226, + "grad_norm": 0.003058303613215685, + "learning_rate": 0.0283434900569436, + "loss": 0.0239, + "num_input_tokens_seen": 28988896, + "step": 32045 + }, + { + "epoch": 8.458756763890722, + "grad_norm": 0.0033735211472958326, + "learning_rate": 0.028309040919378456, + "loss": 0.009, + "num_input_tokens_seen": 28993344, + "step": 32050 + }, + { + "epoch": 8.460076547446219, + "grad_norm": 0.0019898577593266964, + "learning_rate": 0.02827461054808848, + "loss": 0.0184, + "num_input_tokens_seen": 28997792, + "step": 32055 + }, + { + "epoch": 8.461396331001716, + "grad_norm": 0.0020898645743727684, + "learning_rate": 0.028240198948383186, + "loss": 0.0106, + "num_input_tokens_seen": 29002368, + "step": 32060 + }, + { + "epoch": 8.462716114557212, + "grad_norm": 0.0009008743800222874, + "learning_rate": 0.028205806125569402, + "loss": 0.0168, + "num_input_tokens_seen": 29006912, + "step": 32065 + }, + { + "epoch": 8.464035898112709, + "grad_norm": 0.00044619993423111737, + "learning_rate": 0.028171432084950834, + "loss": 0.0142, + "num_input_tokens_seen": 29011552, + "step": 32070 + }, + { + "epoch": 8.465355681668207, + "grad_norm": 0.0005287977401167154, + "learning_rate": 0.028137076831828478, + "loss": 0.0352, + "num_input_tokens_seen": 29016064, + "step": 32075 + }, + { + "epoch": 8.466675465223704, + "grad_norm": 0.0039238641038537025, + "learning_rate": 0.028102740371500238, + "loss": 0.0552, + "num_input_tokens_seen": 29020320, + "step": 32080 + }, + { + "epoch": 8.4679952487792, + "grad_norm": 0.0034143265802413225, + "learning_rate": 0.0280684227092613, + "loss": 0.0406, + "num_input_tokens_seen": 29024960, + "step": 32085 + }, + { + "epoch": 8.469315032334697, + "grad_norm": 0.0020780011545866728, + "learning_rate": 0.02803412385040392, + "loss": 0.039, + "num_input_tokens_seen": 29029472, + "step": 32090 + }, + { + "epoch": 8.470634815890193, + "grad_norm": 0.003670072415843606, + "learning_rate": 0.027999843800217306, + "loss": 0.0282, + "num_input_tokens_seen": 29034016, + "step": 32095 + }, + { + "epoch": 8.471954599445692, + "grad_norm": 0.0019833804108202457, + "learning_rate": 0.027965582563987932, + "loss": 0.0188, + "num_input_tokens_seen": 29038624, + "step": 32100 + }, + { + "epoch": 8.473274383001188, + "grad_norm": 0.0033420599065721035, + "learning_rate": 0.027931340146999346, + "loss": 0.0453, + "num_input_tokens_seen": 29043232, + "step": 32105 + }, + { + "epoch": 8.474594166556685, + "grad_norm": 0.0007732933154329658, + "learning_rate": 0.02789711655453208, + "loss": 0.0112, + "num_input_tokens_seen": 29047744, + "step": 32110 + }, + { + "epoch": 8.475913950112181, + "grad_norm": 0.0001411807897966355, + "learning_rate": 0.02786291179186392, + "loss": 0.0302, + "num_input_tokens_seen": 29052576, + "step": 32115 + }, + { + "epoch": 8.477233733667678, + "grad_norm": 0.0005027725710533559, + "learning_rate": 0.02782872586426961, + "loss": 0.0205, + "num_input_tokens_seen": 29057312, + "step": 32120 + }, + { + "epoch": 8.478553517223176, + "grad_norm": 0.0030093647073954344, + "learning_rate": 0.027794558777021083, + "loss": 0.0481, + "num_input_tokens_seen": 29061984, + "step": 32125 + }, + { + "epoch": 8.479873300778673, + "grad_norm": 0.00207646656781435, + "learning_rate": 0.02776041053538734, + "loss": 0.01, + "num_input_tokens_seen": 29066592, + "step": 32130 + }, + { + "epoch": 8.48119308433417, + "grad_norm": 0.0008397996425628662, + "learning_rate": 0.027726281144634407, + "loss": 0.006, + "num_input_tokens_seen": 29071456, + "step": 32135 + }, + { + "epoch": 8.482512867889666, + "grad_norm": 0.0006196345202624798, + "learning_rate": 0.02769217061002552, + "loss": 0.0083, + "num_input_tokens_seen": 29076096, + "step": 32140 + }, + { + "epoch": 8.483832651445162, + "grad_norm": 0.00020836539624724537, + "learning_rate": 0.027658078936820967, + "loss": 0.0482, + "num_input_tokens_seen": 29080416, + "step": 32145 + }, + { + "epoch": 8.48515243500066, + "grad_norm": 0.0022966826800256968, + "learning_rate": 0.02762400613027805, + "loss": 0.0362, + "num_input_tokens_seen": 29084864, + "step": 32150 + }, + { + "epoch": 8.486472218556157, + "grad_norm": 0.005118992179632187, + "learning_rate": 0.027589952195651295, + "loss": 0.022, + "num_input_tokens_seen": 29089600, + "step": 32155 + }, + { + "epoch": 8.487792002111654, + "grad_norm": 0.0026574130170047283, + "learning_rate": 0.027555917138192186, + "loss": 0.0186, + "num_input_tokens_seen": 29094016, + "step": 32160 + }, + { + "epoch": 8.48911178566715, + "grad_norm": 0.0012609042460098863, + "learning_rate": 0.027521900963149375, + "loss": 0.014, + "num_input_tokens_seen": 29098656, + "step": 32165 + }, + { + "epoch": 8.490431569222647, + "grad_norm": 0.00027817735099233687, + "learning_rate": 0.027487903675768633, + "loss": 0.0105, + "num_input_tokens_seen": 29103168, + "step": 32170 + }, + { + "epoch": 8.491751352778145, + "grad_norm": 0.0013459029141813517, + "learning_rate": 0.027453925281292677, + "loss": 0.0135, + "num_input_tokens_seen": 29107808, + "step": 32175 + }, + { + "epoch": 8.493071136333642, + "grad_norm": 0.002382904291152954, + "learning_rate": 0.027419965784961475, + "loss": 0.0363, + "num_input_tokens_seen": 29112448, + "step": 32180 + }, + { + "epoch": 8.494390919889138, + "grad_norm": 0.0006023160531185567, + "learning_rate": 0.027386025192012015, + "loss": 0.0226, + "num_input_tokens_seen": 29117024, + "step": 32185 + }, + { + "epoch": 8.495710703444635, + "grad_norm": 0.0012522529577836394, + "learning_rate": 0.027352103507678277, + "loss": 0.021, + "num_input_tokens_seen": 29121664, + "step": 32190 + }, + { + "epoch": 8.497030487000131, + "grad_norm": 0.0003631329454947263, + "learning_rate": 0.027318200737191527, + "loss": 0.0128, + "num_input_tokens_seen": 29126048, + "step": 32195 + }, + { + "epoch": 8.49835027055563, + "grad_norm": 0.004403192549943924, + "learning_rate": 0.027284316885779935, + "loss": 0.0402, + "num_input_tokens_seen": 29130656, + "step": 32200 + }, + { + "epoch": 8.49835027055563, + "eval_loss": 0.10273775458335876, + "eval_runtime": 75.9332, + "eval_samples_per_second": 88.696, + "eval_steps_per_second": 22.177, + "num_input_tokens_seen": 29130656, + "step": 32200 + }, + { + "epoch": 8.499670054111126, + "grad_norm": 0.002173693384975195, + "learning_rate": 0.027250451958668785, + "loss": 0.0348, + "num_input_tokens_seen": 29135072, + "step": 32205 + }, + { + "epoch": 8.500989837666623, + "grad_norm": 0.003293020883575082, + "learning_rate": 0.027216605961080536, + "loss": 0.0232, + "num_input_tokens_seen": 29139456, + "step": 32210 + }, + { + "epoch": 8.50230962122212, + "grad_norm": 0.00043379314593039453, + "learning_rate": 0.02718277889823461, + "loss": 0.0114, + "num_input_tokens_seen": 29144032, + "step": 32215 + }, + { + "epoch": 8.503629404777616, + "grad_norm": 0.0027792523615062237, + "learning_rate": 0.027148970775347604, + "loss": 0.016, + "num_input_tokens_seen": 29148352, + "step": 32220 + }, + { + "epoch": 8.504949188333114, + "grad_norm": 0.00012179859913885593, + "learning_rate": 0.027115181597633174, + "loss": 0.0357, + "num_input_tokens_seen": 29153152, + "step": 32225 + }, + { + "epoch": 8.50626897188861, + "grad_norm": 0.0009542871848680079, + "learning_rate": 0.027081411370301976, + "loss": 0.0285, + "num_input_tokens_seen": 29157408, + "step": 32230 + }, + { + "epoch": 8.507588755444107, + "grad_norm": 0.0005650736857205629, + "learning_rate": 0.027047660098561875, + "loss": 0.0116, + "num_input_tokens_seen": 29161952, + "step": 32235 + }, + { + "epoch": 8.508908538999604, + "grad_norm": 0.003108638571575284, + "learning_rate": 0.02701392778761766, + "loss": 0.0148, + "num_input_tokens_seen": 29166400, + "step": 32240 + }, + { + "epoch": 8.5102283225551, + "grad_norm": 0.0043361796997487545, + "learning_rate": 0.02698021444267133, + "loss": 0.0105, + "num_input_tokens_seen": 29171040, + "step": 32245 + }, + { + "epoch": 8.511548106110597, + "grad_norm": 0.0002587416965980083, + "learning_rate": 0.026946520068921915, + "loss": 0.0048, + "num_input_tokens_seen": 29176032, + "step": 32250 + }, + { + "epoch": 8.512867889666095, + "grad_norm": 0.0003994312137365341, + "learning_rate": 0.02691284467156547, + "loss": 0.0218, + "num_input_tokens_seen": 29180512, + "step": 32255 + }, + { + "epoch": 8.514187673221592, + "grad_norm": 0.004266866948455572, + "learning_rate": 0.026879188255795182, + "loss": 0.0253, + "num_input_tokens_seen": 29185184, + "step": 32260 + }, + { + "epoch": 8.515507456777089, + "grad_norm": 0.003514735260978341, + "learning_rate": 0.026845550826801328, + "loss": 0.0232, + "num_input_tokens_seen": 29189888, + "step": 32265 + }, + { + "epoch": 8.516827240332585, + "grad_norm": 0.0026258970610797405, + "learning_rate": 0.02681193238977121, + "loss": 0.0172, + "num_input_tokens_seen": 29194368, + "step": 32270 + }, + { + "epoch": 8.518147023888082, + "grad_norm": 0.0011613474925979972, + "learning_rate": 0.026778332949889145, + "loss": 0.0142, + "num_input_tokens_seen": 29198784, + "step": 32275 + }, + { + "epoch": 8.51946680744358, + "grad_norm": 0.004047075752168894, + "learning_rate": 0.026744752512336673, + "loss": 0.0296, + "num_input_tokens_seen": 29203328, + "step": 32280 + }, + { + "epoch": 8.520786590999077, + "grad_norm": 0.0001725421316223219, + "learning_rate": 0.02671119108229225, + "loss": 0.0083, + "num_input_tokens_seen": 29208064, + "step": 32285 + }, + { + "epoch": 8.522106374554573, + "grad_norm": 0.0005793661694042385, + "learning_rate": 0.026677648664931556, + "loss": 0.0095, + "num_input_tokens_seen": 29212448, + "step": 32290 + }, + { + "epoch": 8.52342615811007, + "grad_norm": 0.0024481071159243584, + "learning_rate": 0.026644125265427154, + "loss": 0.0158, + "num_input_tokens_seen": 29216896, + "step": 32295 + }, + { + "epoch": 8.524745941665566, + "grad_norm": 0.0021530664525926113, + "learning_rate": 0.026610620888948822, + "loss": 0.009, + "num_input_tokens_seen": 29221472, + "step": 32300 + }, + { + "epoch": 8.526065725221065, + "grad_norm": 0.0006279167137108743, + "learning_rate": 0.026577135540663408, + "loss": 0.0104, + "num_input_tokens_seen": 29226496, + "step": 32305 + }, + { + "epoch": 8.527385508776561, + "grad_norm": 0.0003860188589897007, + "learning_rate": 0.026543669225734673, + "loss": 0.0242, + "num_input_tokens_seen": 29230752, + "step": 32310 + }, + { + "epoch": 8.528705292332058, + "grad_norm": 0.0007772828685119748, + "learning_rate": 0.02651022194932363, + "loss": 0.015, + "num_input_tokens_seen": 29235584, + "step": 32315 + }, + { + "epoch": 8.530025075887554, + "grad_norm": 0.00012684530520346016, + "learning_rate": 0.026476793716588194, + "loss": 0.009, + "num_input_tokens_seen": 29240000, + "step": 32320 + }, + { + "epoch": 8.53134485944305, + "grad_norm": 0.0021700062789022923, + "learning_rate": 0.026443384532683467, + "loss": 0.0115, + "num_input_tokens_seen": 29244416, + "step": 32325 + }, + { + "epoch": 8.532664642998547, + "grad_norm": 0.00015714674373157322, + "learning_rate": 0.026409994402761584, + "loss": 0.0072, + "num_input_tokens_seen": 29248608, + "step": 32330 + }, + { + "epoch": 8.533984426554046, + "grad_norm": 0.0009656822076067328, + "learning_rate": 0.026376623331971653, + "loss": 0.0265, + "num_input_tokens_seen": 29253152, + "step": 32335 + }, + { + "epoch": 8.535304210109542, + "grad_norm": 0.0008305701776407659, + "learning_rate": 0.026343271325459997, + "loss": 0.0215, + "num_input_tokens_seen": 29257760, + "step": 32340 + }, + { + "epoch": 8.536623993665039, + "grad_norm": 0.0006299569504335523, + "learning_rate": 0.02630993838836987, + "loss": 0.0272, + "num_input_tokens_seen": 29262272, + "step": 32345 + }, + { + "epoch": 8.537943777220535, + "grad_norm": 0.0028518028557300568, + "learning_rate": 0.026276624525841584, + "loss": 0.0109, + "num_input_tokens_seen": 29266656, + "step": 32350 + }, + { + "epoch": 8.539263560776032, + "grad_norm": 0.00011063527927035466, + "learning_rate": 0.026243329743012637, + "loss": 0.0225, + "num_input_tokens_seen": 29271392, + "step": 32355 + }, + { + "epoch": 8.54058334433153, + "grad_norm": 0.0006016341503709555, + "learning_rate": 0.026210054045017438, + "loss": 0.0082, + "num_input_tokens_seen": 29275904, + "step": 32360 + }, + { + "epoch": 8.541903127887027, + "grad_norm": 0.0052774143405258656, + "learning_rate": 0.02617679743698755, + "loss": 0.0283, + "num_input_tokens_seen": 29280288, + "step": 32365 + }, + { + "epoch": 8.543222911442523, + "grad_norm": 0.00023806824174243957, + "learning_rate": 0.02614355992405158, + "loss": 0.0421, + "num_input_tokens_seen": 29284896, + "step": 32370 + }, + { + "epoch": 8.54454269499802, + "grad_norm": 0.0023650580551475286, + "learning_rate": 0.026110341511335115, + "loss": 0.0224, + "num_input_tokens_seen": 29289280, + "step": 32375 + }, + { + "epoch": 8.545862478553516, + "grad_norm": 0.006253993138670921, + "learning_rate": 0.02607714220396093, + "loss": 0.0344, + "num_input_tokens_seen": 29293920, + "step": 32380 + }, + { + "epoch": 8.547182262109015, + "grad_norm": 0.00014592193474527448, + "learning_rate": 0.02604396200704869, + "loss": 0.0088, + "num_input_tokens_seen": 29298464, + "step": 32385 + }, + { + "epoch": 8.548502045664511, + "grad_norm": 0.0007832964765839279, + "learning_rate": 0.02601080092571523, + "loss": 0.0773, + "num_input_tokens_seen": 29302912, + "step": 32390 + }, + { + "epoch": 8.549821829220008, + "grad_norm": 0.0031135128811001778, + "learning_rate": 0.025977658965074455, + "loss": 0.0662, + "num_input_tokens_seen": 29307712, + "step": 32395 + }, + { + "epoch": 8.551141612775504, + "grad_norm": 0.0035670187789946795, + "learning_rate": 0.02594453613023719, + "loss": 0.0163, + "num_input_tokens_seen": 29312288, + "step": 32400 + }, + { + "epoch": 8.551141612775504, + "eval_loss": 0.1047176644206047, + "eval_runtime": 75.8825, + "eval_samples_per_second": 88.756, + "eval_steps_per_second": 22.192, + "num_input_tokens_seen": 29312288, + "step": 32400 + }, + { + "epoch": 8.552461396331001, + "grad_norm": 0.0030167661607265472, + "learning_rate": 0.025911432426311443, + "loss": 0.0157, + "num_input_tokens_seen": 29316768, + "step": 32405 + }, + { + "epoch": 8.5537811798865, + "grad_norm": 0.002257055602967739, + "learning_rate": 0.025878347858402234, + "loss": 0.0253, + "num_input_tokens_seen": 29321344, + "step": 32410 + }, + { + "epoch": 8.555100963441996, + "grad_norm": 0.0029573244974017143, + "learning_rate": 0.025845282431611598, + "loss": 0.0498, + "num_input_tokens_seen": 29325888, + "step": 32415 + }, + { + "epoch": 8.556420746997492, + "grad_norm": 0.0007100366638042033, + "learning_rate": 0.025812236151038608, + "loss": 0.0127, + "num_input_tokens_seen": 29330208, + "step": 32420 + }, + { + "epoch": 8.557740530552989, + "grad_norm": 0.0006329860771074891, + "learning_rate": 0.025779209021779468, + "loss": 0.0075, + "num_input_tokens_seen": 29334624, + "step": 32425 + }, + { + "epoch": 8.559060314108486, + "grad_norm": 0.0015740637900307775, + "learning_rate": 0.025746201048927324, + "loss": 0.0171, + "num_input_tokens_seen": 29339072, + "step": 32430 + }, + { + "epoch": 8.560380097663984, + "grad_norm": 0.0002573491947259754, + "learning_rate": 0.025713212237572485, + "loss": 0.0248, + "num_input_tokens_seen": 29343616, + "step": 32435 + }, + { + "epoch": 8.56169988121948, + "grad_norm": 0.0012813128996640444, + "learning_rate": 0.025680242592802164, + "loss": 0.0182, + "num_input_tokens_seen": 29348256, + "step": 32440 + }, + { + "epoch": 8.563019664774977, + "grad_norm": 0.001381404115818441, + "learning_rate": 0.02564729211970073, + "loss": 0.0288, + "num_input_tokens_seen": 29352544, + "step": 32445 + }, + { + "epoch": 8.564339448330474, + "grad_norm": 0.0019484075019136071, + "learning_rate": 0.025614360823349617, + "loss": 0.05, + "num_input_tokens_seen": 29356992, + "step": 32450 + }, + { + "epoch": 8.56565923188597, + "grad_norm": 0.00033465935848653316, + "learning_rate": 0.025581448708827146, + "loss": 0.0102, + "num_input_tokens_seen": 29361152, + "step": 32455 + }, + { + "epoch": 8.566979015441468, + "grad_norm": 0.00036750250728800893, + "learning_rate": 0.025548555781208876, + "loss": 0.0162, + "num_input_tokens_seen": 29365760, + "step": 32460 + }, + { + "epoch": 8.568298798996965, + "grad_norm": 0.0002544758899603039, + "learning_rate": 0.02551568204556721, + "loss": 0.0545, + "num_input_tokens_seen": 29370048, + "step": 32465 + }, + { + "epoch": 8.569618582552462, + "grad_norm": 0.004111867863684893, + "learning_rate": 0.02548282750697173, + "loss": 0.0107, + "num_input_tokens_seen": 29374560, + "step": 32470 + }, + { + "epoch": 8.570938366107958, + "grad_norm": 7.610884495079517e-05, + "learning_rate": 0.02544999217048909, + "loss": 0.0242, + "num_input_tokens_seen": 29379264, + "step": 32475 + }, + { + "epoch": 8.572258149663455, + "grad_norm": 0.0020769243128597736, + "learning_rate": 0.025417176041182793, + "loss": 0.019, + "num_input_tokens_seen": 29383648, + "step": 32480 + }, + { + "epoch": 8.573577933218953, + "grad_norm": 0.00011170798825332895, + "learning_rate": 0.025384379124113596, + "loss": 0.0282, + "num_input_tokens_seen": 29388096, + "step": 32485 + }, + { + "epoch": 8.57489771677445, + "grad_norm": 0.0006688199937343597, + "learning_rate": 0.025351601424339124, + "loss": 0.0352, + "num_input_tokens_seen": 29392416, + "step": 32490 + }, + { + "epoch": 8.576217500329946, + "grad_norm": 0.004703810438513756, + "learning_rate": 0.025318842946914184, + "loss": 0.0411, + "num_input_tokens_seen": 29396960, + "step": 32495 + }, + { + "epoch": 8.577537283885443, + "grad_norm": 0.004059411119669676, + "learning_rate": 0.025286103696890494, + "loss": 0.0281, + "num_input_tokens_seen": 29401664, + "step": 32500 + }, + { + "epoch": 8.57885706744094, + "grad_norm": 0.003266821848228574, + "learning_rate": 0.025253383679316836, + "loss": 0.0246, + "num_input_tokens_seen": 29406528, + "step": 32505 + }, + { + "epoch": 8.580176850996436, + "grad_norm": 0.003329961095005274, + "learning_rate": 0.025220682899239077, + "loss": 0.0278, + "num_input_tokens_seen": 29411296, + "step": 32510 + }, + { + "epoch": 8.581496634551934, + "grad_norm": 0.0012297669891268015, + "learning_rate": 0.02518800136170013, + "loss": 0.0426, + "num_input_tokens_seen": 29415840, + "step": 32515 + }, + { + "epoch": 8.58281641810743, + "grad_norm": 0.0006486919592134655, + "learning_rate": 0.02515533907173981, + "loss": 0.0376, + "num_input_tokens_seen": 29420352, + "step": 32520 + }, + { + "epoch": 8.584136201662927, + "grad_norm": 0.0014950220938771963, + "learning_rate": 0.025122696034395115, + "loss": 0.013, + "num_input_tokens_seen": 29424992, + "step": 32525 + }, + { + "epoch": 8.585455985218424, + "grad_norm": 0.003389427438378334, + "learning_rate": 0.025090072254700023, + "loss": 0.0249, + "num_input_tokens_seen": 29429472, + "step": 32530 + }, + { + "epoch": 8.58677576877392, + "grad_norm": 0.004520596470683813, + "learning_rate": 0.025057467737685468, + "loss": 0.0299, + "num_input_tokens_seen": 29434176, + "step": 32535 + }, + { + "epoch": 8.588095552329419, + "grad_norm": 0.005104459822177887, + "learning_rate": 0.025024882488379557, + "loss": 0.019, + "num_input_tokens_seen": 29438816, + "step": 32540 + }, + { + "epoch": 8.589415335884915, + "grad_norm": 0.0014738262398168445, + "learning_rate": 0.02499231651180727, + "loss": 0.0395, + "num_input_tokens_seen": 29443424, + "step": 32545 + }, + { + "epoch": 8.590735119440412, + "grad_norm": 0.0006620486965402961, + "learning_rate": 0.024959769812990713, + "loss": 0.0353, + "num_input_tokens_seen": 29447776, + "step": 32550 + }, + { + "epoch": 8.592054902995908, + "grad_norm": 0.0005390630685724318, + "learning_rate": 0.024927242396949045, + "loss": 0.039, + "num_input_tokens_seen": 29452192, + "step": 32555 + }, + { + "epoch": 8.593374686551405, + "grad_norm": 0.0018429479096084833, + "learning_rate": 0.02489473426869836, + "loss": 0.0209, + "num_input_tokens_seen": 29456576, + "step": 32560 + }, + { + "epoch": 8.594694470106903, + "grad_norm": 0.0023941337130963802, + "learning_rate": 0.024862245433251776, + "loss": 0.0216, + "num_input_tokens_seen": 29461312, + "step": 32565 + }, + { + "epoch": 8.5960142536624, + "grad_norm": 0.00073953130049631, + "learning_rate": 0.024829775895619577, + "loss": 0.0075, + "num_input_tokens_seen": 29465920, + "step": 32570 + }, + { + "epoch": 8.597334037217896, + "grad_norm": 0.0022168713621795177, + "learning_rate": 0.024797325660808882, + "loss": 0.0145, + "num_input_tokens_seen": 29470624, + "step": 32575 + }, + { + "epoch": 8.598653820773393, + "grad_norm": 0.0011107290629297495, + "learning_rate": 0.02476489473382401, + "loss": 0.0091, + "num_input_tokens_seen": 29474848, + "step": 32580 + }, + { + "epoch": 8.59997360432889, + "grad_norm": 0.0007389285019598901, + "learning_rate": 0.024732483119666127, + "loss": 0.0135, + "num_input_tokens_seen": 29479232, + "step": 32585 + }, + { + "epoch": 8.601293387884386, + "grad_norm": 0.0024832289200276136, + "learning_rate": 0.024700090823333548, + "loss": 0.0709, + "num_input_tokens_seen": 29483872, + "step": 32590 + }, + { + "epoch": 8.602613171439884, + "grad_norm": 0.004007522948086262, + "learning_rate": 0.02466771784982163, + "loss": 0.0356, + "num_input_tokens_seen": 29488096, + "step": 32595 + }, + { + "epoch": 8.603932954995381, + "grad_norm": 0.002214805455878377, + "learning_rate": 0.024635364204122594, + "loss": 0.0351, + "num_input_tokens_seen": 29492416, + "step": 32600 + }, + { + "epoch": 8.603932954995381, + "eval_loss": 0.1015215590596199, + "eval_runtime": 75.8964, + "eval_samples_per_second": 88.739, + "eval_steps_per_second": 22.188, + "num_input_tokens_seen": 29492416, + "step": 32600 + }, + { + "epoch": 8.605252738550877, + "grad_norm": 0.001103186747059226, + "learning_rate": 0.024603029891225852, + "loss": 0.0214, + "num_input_tokens_seen": 29496992, + "step": 32605 + }, + { + "epoch": 8.606572522106374, + "grad_norm": 0.005448689218610525, + "learning_rate": 0.024570714916117748, + "loss": 0.0446, + "num_input_tokens_seen": 29501504, + "step": 32610 + }, + { + "epoch": 8.60789230566187, + "grad_norm": 0.0003925523778889328, + "learning_rate": 0.024538419283781625, + "loss": 0.0074, + "num_input_tokens_seen": 29505728, + "step": 32615 + }, + { + "epoch": 8.609212089217369, + "grad_norm": 0.0011882432736456394, + "learning_rate": 0.024506142999197938, + "loss": 0.0241, + "num_input_tokens_seen": 29510048, + "step": 32620 + }, + { + "epoch": 8.610531872772865, + "grad_norm": 0.0004863201465923339, + "learning_rate": 0.024473886067344002, + "loss": 0.0189, + "num_input_tokens_seen": 29514304, + "step": 32625 + }, + { + "epoch": 8.611851656328362, + "grad_norm": 0.0004426286614034325, + "learning_rate": 0.02444164849319434, + "loss": 0.0106, + "num_input_tokens_seen": 29518912, + "step": 32630 + }, + { + "epoch": 8.613171439883859, + "grad_norm": 0.0040117609314620495, + "learning_rate": 0.024409430281720306, + "loss": 0.0251, + "num_input_tokens_seen": 29523680, + "step": 32635 + }, + { + "epoch": 8.614491223439355, + "grad_norm": 0.00047344702761620283, + "learning_rate": 0.024377231437890428, + "loss": 0.0298, + "num_input_tokens_seen": 29528064, + "step": 32640 + }, + { + "epoch": 8.615811006994853, + "grad_norm": 0.006942416075617075, + "learning_rate": 0.024345051966670115, + "loss": 0.0337, + "num_input_tokens_seen": 29532576, + "step": 32645 + }, + { + "epoch": 8.61713079055035, + "grad_norm": 0.002746804617345333, + "learning_rate": 0.024312891873021884, + "loss": 0.0222, + "num_input_tokens_seen": 29537088, + "step": 32650 + }, + { + "epoch": 8.618450574105847, + "grad_norm": 0.000871355296112597, + "learning_rate": 0.024280751161905183, + "loss": 0.0187, + "num_input_tokens_seen": 29541824, + "step": 32655 + }, + { + "epoch": 8.619770357661343, + "grad_norm": 0.0022067262325435877, + "learning_rate": 0.02424862983827658, + "loss": 0.0195, + "num_input_tokens_seen": 29546208, + "step": 32660 + }, + { + "epoch": 8.62109014121684, + "grad_norm": 0.005477902013808489, + "learning_rate": 0.024216527907089495, + "loss": 0.0503, + "num_input_tokens_seen": 29550784, + "step": 32665 + }, + { + "epoch": 8.622409924772338, + "grad_norm": 0.0010030585108324885, + "learning_rate": 0.024184445373294505, + "loss": 0.0199, + "num_input_tokens_seen": 29555040, + "step": 32670 + }, + { + "epoch": 8.623729708327835, + "grad_norm": 0.0005386820994317532, + "learning_rate": 0.02415238224183918, + "loss": 0.0152, + "num_input_tokens_seen": 29559712, + "step": 32675 + }, + { + "epoch": 8.625049491883331, + "grad_norm": 0.0017898351652547717, + "learning_rate": 0.024120338517667973, + "loss": 0.0606, + "num_input_tokens_seen": 29564224, + "step": 32680 + }, + { + "epoch": 8.626369275438828, + "grad_norm": 0.0036583933979272842, + "learning_rate": 0.02408831420572247, + "loss": 0.0532, + "num_input_tokens_seen": 29568544, + "step": 32685 + }, + { + "epoch": 8.627689058994324, + "grad_norm": 0.0029545892030000687, + "learning_rate": 0.024056309310941264, + "loss": 0.0231, + "num_input_tokens_seen": 29573056, + "step": 32690 + }, + { + "epoch": 8.629008842549823, + "grad_norm": 0.0050596147775650024, + "learning_rate": 0.02402432383825982, + "loss": 0.0663, + "num_input_tokens_seen": 29577504, + "step": 32695 + }, + { + "epoch": 8.63032862610532, + "grad_norm": 0.003352787345647812, + "learning_rate": 0.023992357792610792, + "loss": 0.031, + "num_input_tokens_seen": 29582208, + "step": 32700 + }, + { + "epoch": 8.631648409660816, + "grad_norm": 0.001099199871532619, + "learning_rate": 0.0239604111789237, + "loss": 0.0429, + "num_input_tokens_seen": 29586752, + "step": 32705 + }, + { + "epoch": 8.632968193216312, + "grad_norm": 0.001272047869861126, + "learning_rate": 0.023928484002125095, + "loss": 0.0302, + "num_input_tokens_seen": 29591136, + "step": 32710 + }, + { + "epoch": 8.634287976771809, + "grad_norm": 0.0017572568031027913, + "learning_rate": 0.023896576267138595, + "loss": 0.0351, + "num_input_tokens_seen": 29596000, + "step": 32715 + }, + { + "epoch": 8.635607760327307, + "grad_norm": 0.0014993310905992985, + "learning_rate": 0.02386468797888471, + "loss": 0.0135, + "num_input_tokens_seen": 29600288, + "step": 32720 + }, + { + "epoch": 8.636927543882804, + "grad_norm": 0.0006008573109284043, + "learning_rate": 0.023832819142281057, + "loss": 0.0094, + "num_input_tokens_seen": 29605056, + "step": 32725 + }, + { + "epoch": 8.6382473274383, + "grad_norm": 0.0014503641286864877, + "learning_rate": 0.02380096976224225, + "loss": 0.0256, + "num_input_tokens_seen": 29609184, + "step": 32730 + }, + { + "epoch": 8.639567110993797, + "grad_norm": 0.0019384643528610468, + "learning_rate": 0.023769139843679777, + "loss": 0.0156, + "num_input_tokens_seen": 29613696, + "step": 32735 + }, + { + "epoch": 8.640886894549293, + "grad_norm": 0.0004555787891149521, + "learning_rate": 0.023737329391502287, + "loss": 0.0256, + "num_input_tokens_seen": 29618112, + "step": 32740 + }, + { + "epoch": 8.642206678104792, + "grad_norm": 0.0017840510699898005, + "learning_rate": 0.023705538410615293, + "loss": 0.0223, + "num_input_tokens_seen": 29622528, + "step": 32745 + }, + { + "epoch": 8.643526461660288, + "grad_norm": 0.003534891875460744, + "learning_rate": 0.023673766905921396, + "loss": 0.0204, + "num_input_tokens_seen": 29627136, + "step": 32750 + }, + { + "epoch": 8.644846245215785, + "grad_norm": 0.0030259680934250355, + "learning_rate": 0.0236420148823202, + "loss": 0.0322, + "num_input_tokens_seen": 29631328, + "step": 32755 + }, + { + "epoch": 8.646166028771281, + "grad_norm": 0.0005702665657736361, + "learning_rate": 0.02361028234470816, + "loss": 0.0302, + "num_input_tokens_seen": 29635904, + "step": 32760 + }, + { + "epoch": 8.647485812326778, + "grad_norm": 0.0006016383413225412, + "learning_rate": 0.023578569297978913, + "loss": 0.0171, + "num_input_tokens_seen": 29640384, + "step": 32765 + }, + { + "epoch": 8.648805595882274, + "grad_norm": 0.0011725940275937319, + "learning_rate": 0.023546875747023025, + "loss": 0.02, + "num_input_tokens_seen": 29644832, + "step": 32770 + }, + { + "epoch": 8.650125379437773, + "grad_norm": 0.0007671783678233624, + "learning_rate": 0.02351520169672801, + "loss": 0.0143, + "num_input_tokens_seen": 29649376, + "step": 32775 + }, + { + "epoch": 8.65144516299327, + "grad_norm": 0.0022276805248111486, + "learning_rate": 0.023483547151978357, + "loss": 0.053, + "num_input_tokens_seen": 29654496, + "step": 32780 + }, + { + "epoch": 8.652764946548766, + "grad_norm": 0.002705046907067299, + "learning_rate": 0.023451912117655675, + "loss": 0.0192, + "num_input_tokens_seen": 29659392, + "step": 32785 + }, + { + "epoch": 8.654084730104262, + "grad_norm": 0.00046993920113891363, + "learning_rate": 0.023420296598638417, + "loss": 0.0254, + "num_input_tokens_seen": 29663808, + "step": 32790 + }, + { + "epoch": 8.655404513659759, + "grad_norm": 0.004184040240943432, + "learning_rate": 0.023388700599802165, + "loss": 0.0204, + "num_input_tokens_seen": 29668544, + "step": 32795 + }, + { + "epoch": 8.656724297215257, + "grad_norm": 0.004680674057453871, + "learning_rate": 0.023357124126019334, + "loss": 0.0599, + "num_input_tokens_seen": 29672864, + "step": 32800 + }, + { + "epoch": 8.656724297215257, + "eval_loss": 0.10233207046985626, + "eval_runtime": 75.8931, + "eval_samples_per_second": 88.743, + "eval_steps_per_second": 22.189, + "num_input_tokens_seen": 29672864, + "step": 32800 + }, + { + "epoch": 8.658044080770754, + "grad_norm": 0.0028342721052467823, + "learning_rate": 0.02332556718215945, + "loss": 0.0266, + "num_input_tokens_seen": 29677568, + "step": 32805 + }, + { + "epoch": 8.65936386432625, + "grad_norm": 0.005184524226933718, + "learning_rate": 0.023294029773089035, + "loss": 0.0369, + "num_input_tokens_seen": 29682048, + "step": 32810 + }, + { + "epoch": 8.660683647881747, + "grad_norm": 0.003296431852504611, + "learning_rate": 0.023262511903671484, + "loss": 0.0254, + "num_input_tokens_seen": 29686624, + "step": 32815 + }, + { + "epoch": 8.662003431437244, + "grad_norm": 0.0024357829242944717, + "learning_rate": 0.023231013578767324, + "loss": 0.0207, + "num_input_tokens_seen": 29691328, + "step": 32820 + }, + { + "epoch": 8.663323214992742, + "grad_norm": 0.0011357634793967009, + "learning_rate": 0.0231995348032339, + "loss": 0.0296, + "num_input_tokens_seen": 29695744, + "step": 32825 + }, + { + "epoch": 8.664642998548238, + "grad_norm": 0.0016130590811371803, + "learning_rate": 0.023168075581925685, + "loss": 0.0434, + "num_input_tokens_seen": 29700480, + "step": 32830 + }, + { + "epoch": 8.665962782103735, + "grad_norm": 0.0012732073664665222, + "learning_rate": 0.023136635919694126, + "loss": 0.0391, + "num_input_tokens_seen": 29705216, + "step": 32835 + }, + { + "epoch": 8.667282565659232, + "grad_norm": 0.0010995070915669203, + "learning_rate": 0.02310521582138753, + "loss": 0.0242, + "num_input_tokens_seen": 29709760, + "step": 32840 + }, + { + "epoch": 8.668602349214728, + "grad_norm": 0.00306746456772089, + "learning_rate": 0.023073815291851357, + "loss": 0.0261, + "num_input_tokens_seen": 29714240, + "step": 32845 + }, + { + "epoch": 8.669922132770226, + "grad_norm": 0.0008707729284651577, + "learning_rate": 0.02304243433592788, + "loss": 0.028, + "num_input_tokens_seen": 29718688, + "step": 32850 + }, + { + "epoch": 8.671241916325723, + "grad_norm": 0.0021521251183003187, + "learning_rate": 0.023011072958456513, + "loss": 0.0384, + "num_input_tokens_seen": 29722944, + "step": 32855 + }, + { + "epoch": 8.67256169988122, + "grad_norm": 0.0019737184047698975, + "learning_rate": 0.022979731164273536, + "loss": 0.0174, + "num_input_tokens_seen": 29727680, + "step": 32860 + }, + { + "epoch": 8.673881483436716, + "grad_norm": 0.005185352172702551, + "learning_rate": 0.022948408958212218, + "loss": 0.0227, + "num_input_tokens_seen": 29732224, + "step": 32865 + }, + { + "epoch": 8.675201266992213, + "grad_norm": 0.0037042824551463127, + "learning_rate": 0.022917106345102876, + "loss": 0.0301, + "num_input_tokens_seen": 29736640, + "step": 32870 + }, + { + "epoch": 8.67652105054771, + "grad_norm": 0.0017883493565022945, + "learning_rate": 0.022885823329772785, + "loss": 0.0505, + "num_input_tokens_seen": 29741216, + "step": 32875 + }, + { + "epoch": 8.677840834103208, + "grad_norm": 0.000973557005636394, + "learning_rate": 0.02285455991704612, + "loss": 0.0207, + "num_input_tokens_seen": 29745984, + "step": 32880 + }, + { + "epoch": 8.679160617658704, + "grad_norm": 0.002725766971707344, + "learning_rate": 0.022823316111744117, + "loss": 0.0295, + "num_input_tokens_seen": 29750560, + "step": 32885 + }, + { + "epoch": 8.6804804012142, + "grad_norm": 0.0006026573246344924, + "learning_rate": 0.022792091918685014, + "loss": 0.0094, + "num_input_tokens_seen": 29755008, + "step": 32890 + }, + { + "epoch": 8.681800184769697, + "grad_norm": 0.0026127409655600786, + "learning_rate": 0.022760887342683906, + "loss": 0.0228, + "num_input_tokens_seen": 29759328, + "step": 32895 + }, + { + "epoch": 8.683119968325194, + "grad_norm": 0.0024339219089597464, + "learning_rate": 0.022729702388552975, + "loss": 0.0319, + "num_input_tokens_seen": 29763744, + "step": 32900 + }, + { + "epoch": 8.684439751880692, + "grad_norm": 0.001191083574667573, + "learning_rate": 0.022698537061101292, + "loss": 0.0317, + "num_input_tokens_seen": 29768416, + "step": 32905 + }, + { + "epoch": 8.685759535436189, + "grad_norm": 0.0005437489598989487, + "learning_rate": 0.022667391365134962, + "loss": 0.0307, + "num_input_tokens_seen": 29773120, + "step": 32910 + }, + { + "epoch": 8.687079318991685, + "grad_norm": 0.0016590720042586327, + "learning_rate": 0.022636265305457065, + "loss": 0.0085, + "num_input_tokens_seen": 29777440, + "step": 32915 + }, + { + "epoch": 8.688399102547182, + "grad_norm": 0.0010355908889323473, + "learning_rate": 0.02260515888686764, + "loss": 0.0206, + "num_input_tokens_seen": 29782016, + "step": 32920 + }, + { + "epoch": 8.689718886102678, + "grad_norm": 0.008421855047345161, + "learning_rate": 0.022574072114163596, + "loss": 0.0393, + "num_input_tokens_seen": 29786528, + "step": 32925 + }, + { + "epoch": 8.691038669658177, + "grad_norm": 0.0005793353775516152, + "learning_rate": 0.022543004992139005, + "loss": 0.0393, + "num_input_tokens_seen": 29791200, + "step": 32930 + }, + { + "epoch": 8.692358453213673, + "grad_norm": 0.0015635722083970904, + "learning_rate": 0.022511957525584745, + "loss": 0.0226, + "num_input_tokens_seen": 29795840, + "step": 32935 + }, + { + "epoch": 8.69367823676917, + "grad_norm": 0.0006452014204114676, + "learning_rate": 0.022480929719288778, + "loss": 0.0171, + "num_input_tokens_seen": 29800512, + "step": 32940 + }, + { + "epoch": 8.694998020324666, + "grad_norm": 0.0015080766752362251, + "learning_rate": 0.02244992157803592, + "loss": 0.0254, + "num_input_tokens_seen": 29805216, + "step": 32945 + }, + { + "epoch": 8.696317803880163, + "grad_norm": 0.00039705276140011847, + "learning_rate": 0.022418933106608047, + "loss": 0.0118, + "num_input_tokens_seen": 29809792, + "step": 32950 + }, + { + "epoch": 8.697637587435661, + "grad_norm": 0.0007109578582458198, + "learning_rate": 0.022387964309784018, + "loss": 0.067, + "num_input_tokens_seen": 29813856, + "step": 32955 + }, + { + "epoch": 8.698957370991158, + "grad_norm": 0.00018668825214263052, + "learning_rate": 0.022357015192339517, + "loss": 0.0073, + "num_input_tokens_seen": 29818400, + "step": 32960 + }, + { + "epoch": 8.700277154546654, + "grad_norm": 0.0025967450346797705, + "learning_rate": 0.02232608575904734, + "loss": 0.0653, + "num_input_tokens_seen": 29823200, + "step": 32965 + }, + { + "epoch": 8.701596938102151, + "grad_norm": 0.0003718151419889182, + "learning_rate": 0.022295176014677225, + "loss": 0.0091, + "num_input_tokens_seen": 29827456, + "step": 32970 + }, + { + "epoch": 8.702916721657648, + "grad_norm": 0.0003363052092026919, + "learning_rate": 0.02226428596399577, + "loss": 0.0278, + "num_input_tokens_seen": 29831872, + "step": 32975 + }, + { + "epoch": 8.704236505213146, + "grad_norm": 0.0029764382634311914, + "learning_rate": 0.02223341561176669, + "loss": 0.0333, + "num_input_tokens_seen": 29836640, + "step": 32980 + }, + { + "epoch": 8.705556288768642, + "grad_norm": 0.002350866561755538, + "learning_rate": 0.0222025649627505, + "loss": 0.0191, + "num_input_tokens_seen": 29841184, + "step": 32985 + }, + { + "epoch": 8.706876072324139, + "grad_norm": 0.0033979853615164757, + "learning_rate": 0.022171734021704814, + "loss": 0.0235, + "num_input_tokens_seen": 29845600, + "step": 32990 + }, + { + "epoch": 8.708195855879636, + "grad_norm": 5.8679517678683624e-05, + "learning_rate": 0.022140922793384116, + "loss": 0.0529, + "num_input_tokens_seen": 29849952, + "step": 32995 + }, + { + "epoch": 8.709515639435132, + "grad_norm": 0.003662817645817995, + "learning_rate": 0.022110131282539934, + "loss": 0.0217, + "num_input_tokens_seen": 29854336, + "step": 33000 + }, + { + "epoch": 8.709515639435132, + "eval_loss": 0.09946867823600769, + "eval_runtime": 75.8662, + "eval_samples_per_second": 88.775, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 29854336, + "step": 33000 + }, + { + "epoch": 8.71083542299063, + "grad_norm": 0.0012973578413948417, + "learning_rate": 0.022079359493920675, + "loss": 0.0126, + "num_input_tokens_seen": 29858752, + "step": 33005 + }, + { + "epoch": 8.712155206546127, + "grad_norm": 0.0005109217599965632, + "learning_rate": 0.02204860743227169, + "loss": 0.0103, + "num_input_tokens_seen": 29863264, + "step": 33010 + }, + { + "epoch": 8.713474990101624, + "grad_norm": 0.0023262740578502417, + "learning_rate": 0.022017875102335365, + "loss": 0.0142, + "num_input_tokens_seen": 29867744, + "step": 33015 + }, + { + "epoch": 8.71479477365712, + "grad_norm": 0.0023195354733616114, + "learning_rate": 0.02198716250885108, + "loss": 0.0303, + "num_input_tokens_seen": 29872736, + "step": 33020 + }, + { + "epoch": 8.716114557212617, + "grad_norm": 0.0024182689376175404, + "learning_rate": 0.021956469656555, + "loss": 0.0496, + "num_input_tokens_seen": 29877344, + "step": 33025 + }, + { + "epoch": 8.717434340768115, + "grad_norm": 0.0005016459617763758, + "learning_rate": 0.0219257965501804, + "loss": 0.0197, + "num_input_tokens_seen": 29881984, + "step": 33030 + }, + { + "epoch": 8.718754124323612, + "grad_norm": 0.0004883314250037074, + "learning_rate": 0.021895143194457494, + "loss": 0.0144, + "num_input_tokens_seen": 29886784, + "step": 33035 + }, + { + "epoch": 8.720073907879108, + "grad_norm": 0.004469424951821566, + "learning_rate": 0.021864509594113322, + "loss": 0.0334, + "num_input_tokens_seen": 29891392, + "step": 33040 + }, + { + "epoch": 8.721393691434605, + "grad_norm": 0.003942309878766537, + "learning_rate": 0.02183389575387207, + "loss": 0.0175, + "num_input_tokens_seen": 29895936, + "step": 33045 + }, + { + "epoch": 8.722713474990101, + "grad_norm": 0.0011210638331249356, + "learning_rate": 0.021803301678454682, + "loss": 0.0325, + "num_input_tokens_seen": 29900384, + "step": 33050 + }, + { + "epoch": 8.724033258545598, + "grad_norm": 0.000595323508605361, + "learning_rate": 0.021772727372579213, + "loss": 0.0181, + "num_input_tokens_seen": 29905024, + "step": 33055 + }, + { + "epoch": 8.725353042101096, + "grad_norm": 0.0009720342350192368, + "learning_rate": 0.02174217284096061, + "loss": 0.0123, + "num_input_tokens_seen": 29909952, + "step": 33060 + }, + { + "epoch": 8.726672825656593, + "grad_norm": 0.0020203583408147097, + "learning_rate": 0.0217116380883107, + "loss": 0.0271, + "num_input_tokens_seen": 29914208, + "step": 33065 + }, + { + "epoch": 8.72799260921209, + "grad_norm": 0.001229443703778088, + "learning_rate": 0.021681123119338425, + "loss": 0.0383, + "num_input_tokens_seen": 29918400, + "step": 33070 + }, + { + "epoch": 8.729312392767586, + "grad_norm": 0.0001726663758745417, + "learning_rate": 0.02165062793874951, + "loss": 0.0082, + "num_input_tokens_seen": 29922784, + "step": 33075 + }, + { + "epoch": 8.730632176323082, + "grad_norm": 0.0006002418231219053, + "learning_rate": 0.021620152551246666, + "loss": 0.0469, + "num_input_tokens_seen": 29927360, + "step": 33080 + }, + { + "epoch": 8.73195195987858, + "grad_norm": 0.0007391213439404964, + "learning_rate": 0.02158969696152967, + "loss": 0.0147, + "num_input_tokens_seen": 29931776, + "step": 33085 + }, + { + "epoch": 8.733271743434077, + "grad_norm": 0.004015195183455944, + "learning_rate": 0.021559261174295057, + "loss": 0.0261, + "num_input_tokens_seen": 29936608, + "step": 33090 + }, + { + "epoch": 8.734591526989574, + "grad_norm": 0.000872997276019305, + "learning_rate": 0.02152884519423646, + "loss": 0.0118, + "num_input_tokens_seen": 29940896, + "step": 33095 + }, + { + "epoch": 8.73591131054507, + "grad_norm": 0.001537068048492074, + "learning_rate": 0.021498449026044447, + "loss": 0.0135, + "num_input_tokens_seen": 29945760, + "step": 33100 + }, + { + "epoch": 8.737231094100567, + "grad_norm": 0.003231001552194357, + "learning_rate": 0.021468072674406414, + "loss": 0.0382, + "num_input_tokens_seen": 29950080, + "step": 33105 + }, + { + "epoch": 8.738550877656065, + "grad_norm": 0.0026522576808929443, + "learning_rate": 0.021437716144006795, + "loss": 0.0211, + "num_input_tokens_seen": 29954656, + "step": 33110 + }, + { + "epoch": 8.739870661211562, + "grad_norm": 0.003049728460609913, + "learning_rate": 0.021407379439527002, + "loss": 0.0214, + "num_input_tokens_seen": 29959712, + "step": 33115 + }, + { + "epoch": 8.741190444767058, + "grad_norm": 0.0011457595974206924, + "learning_rate": 0.021377062565645255, + "loss": 0.0419, + "num_input_tokens_seen": 29964320, + "step": 33120 + }, + { + "epoch": 8.742510228322555, + "grad_norm": 0.004876948427408934, + "learning_rate": 0.02134676552703688, + "loss": 0.041, + "num_input_tokens_seen": 29968864, + "step": 33125 + }, + { + "epoch": 8.743830011878051, + "grad_norm": 0.0021507535129785538, + "learning_rate": 0.02131648832837398, + "loss": 0.0399, + "num_input_tokens_seen": 29973440, + "step": 33130 + }, + { + "epoch": 8.745149795433548, + "grad_norm": 0.00048102144501172006, + "learning_rate": 0.02128623097432574, + "loss": 0.0422, + "num_input_tokens_seen": 29978016, + "step": 33135 + }, + { + "epoch": 8.746469578989046, + "grad_norm": 0.002616821089759469, + "learning_rate": 0.021255993469558192, + "loss": 0.0263, + "num_input_tokens_seen": 29982816, + "step": 33140 + }, + { + "epoch": 8.747789362544543, + "grad_norm": 0.0007084730896167457, + "learning_rate": 0.021225775818734364, + "loss": 0.0172, + "num_input_tokens_seen": 29987424, + "step": 33145 + }, + { + "epoch": 8.74910914610004, + "grad_norm": 0.0020688907243311405, + "learning_rate": 0.021195578026514166, + "loss": 0.0325, + "num_input_tokens_seen": 29992096, + "step": 33150 + }, + { + "epoch": 8.750428929655536, + "grad_norm": 0.0037577711045742035, + "learning_rate": 0.02116540009755452, + "loss": 0.0392, + "num_input_tokens_seen": 29996768, + "step": 33155 + }, + { + "epoch": 8.751748713211033, + "grad_norm": 0.0027717535849660635, + "learning_rate": 0.021135242036509173, + "loss": 0.013, + "num_input_tokens_seen": 30001344, + "step": 33160 + }, + { + "epoch": 8.75306849676653, + "grad_norm": 0.003105464391410351, + "learning_rate": 0.021105103848028967, + "loss": 0.0423, + "num_input_tokens_seen": 30005792, + "step": 33165 + }, + { + "epoch": 8.754388280322027, + "grad_norm": 0.0011349882697686553, + "learning_rate": 0.021074985536761504, + "loss": 0.0175, + "num_input_tokens_seen": 30010016, + "step": 33170 + }, + { + "epoch": 8.755708063877524, + "grad_norm": 0.004328377544879913, + "learning_rate": 0.021044887107351435, + "loss": 0.026, + "num_input_tokens_seen": 30014400, + "step": 33175 + }, + { + "epoch": 8.75702784743302, + "grad_norm": 0.0037692945916205645, + "learning_rate": 0.021014808564440362, + "loss": 0.0401, + "num_input_tokens_seen": 30018624, + "step": 33180 + }, + { + "epoch": 8.758347630988517, + "grad_norm": 0.002668027998879552, + "learning_rate": 0.02098474991266671, + "loss": 0.0115, + "num_input_tokens_seen": 30023264, + "step": 33185 + }, + { + "epoch": 8.759667414544015, + "grad_norm": 0.0002114187809638679, + "learning_rate": 0.02095471115666592, + "loss": 0.0199, + "num_input_tokens_seen": 30027808, + "step": 33190 + }, + { + "epoch": 8.760987198099512, + "grad_norm": 0.0017969065811485052, + "learning_rate": 0.020924692301070406, + "loss": 0.0358, + "num_input_tokens_seen": 30032128, + "step": 33195 + }, + { + "epoch": 8.762306981655009, + "grad_norm": 0.00045332495938055217, + "learning_rate": 0.020894693350509346, + "loss": 0.0126, + "num_input_tokens_seen": 30036448, + "step": 33200 + }, + { + "epoch": 8.762306981655009, + "eval_loss": 0.10028576105833054, + "eval_runtime": 75.8928, + "eval_samples_per_second": 88.744, + "eval_steps_per_second": 22.189, + "num_input_tokens_seen": 30036448, + "step": 33200 + }, + { + "epoch": 8.763626765210505, + "grad_norm": 0.0012283313553780317, + "learning_rate": 0.020864714309609057, + "loss": 0.0681, + "num_input_tokens_seen": 30040896, + "step": 33205 + }, + { + "epoch": 8.764946548766002, + "grad_norm": 0.0036809309385716915, + "learning_rate": 0.020834755182992604, + "loss": 0.0199, + "num_input_tokens_seen": 30045312, + "step": 33210 + }, + { + "epoch": 8.7662663323215, + "grad_norm": 0.0019607008434832096, + "learning_rate": 0.02080481597528011, + "loss": 0.0621, + "num_input_tokens_seen": 30049632, + "step": 33215 + }, + { + "epoch": 8.767586115876997, + "grad_norm": 0.0012640284840017557, + "learning_rate": 0.020774896691088583, + "loss": 0.0712, + "num_input_tokens_seen": 30054112, + "step": 33220 + }, + { + "epoch": 8.768905899432493, + "grad_norm": 0.0067563909105956554, + "learning_rate": 0.020744997335031882, + "loss": 0.0201, + "num_input_tokens_seen": 30059040, + "step": 33225 + }, + { + "epoch": 8.77022568298799, + "grad_norm": 0.00014466297579929233, + "learning_rate": 0.02071511791172092, + "loss": 0.0082, + "num_input_tokens_seen": 30063488, + "step": 33230 + }, + { + "epoch": 8.771545466543486, + "grad_norm": 0.0004925979301333427, + "learning_rate": 0.02068525842576351, + "loss": 0.0159, + "num_input_tokens_seen": 30068032, + "step": 33235 + }, + { + "epoch": 8.772865250098985, + "grad_norm": 0.002483692253008485, + "learning_rate": 0.020655418881764264, + "loss": 0.0413, + "num_input_tokens_seen": 30072512, + "step": 33240 + }, + { + "epoch": 8.774185033654481, + "grad_norm": 0.0005271092522889376, + "learning_rate": 0.020625599284324923, + "loss": 0.0106, + "num_input_tokens_seen": 30077184, + "step": 33245 + }, + { + "epoch": 8.775504817209978, + "grad_norm": 0.002154351444914937, + "learning_rate": 0.02059579963804396, + "loss": 0.0274, + "num_input_tokens_seen": 30081792, + "step": 33250 + }, + { + "epoch": 8.776824600765474, + "grad_norm": 0.0010753627866506577, + "learning_rate": 0.02056601994751688, + "loss": 0.0367, + "num_input_tokens_seen": 30086240, + "step": 33255 + }, + { + "epoch": 8.77814438432097, + "grad_norm": 0.00032478244975209236, + "learning_rate": 0.02053626021733614, + "loss": 0.0168, + "num_input_tokens_seen": 30090720, + "step": 33260 + }, + { + "epoch": 8.779464167876469, + "grad_norm": 0.00030969592626206577, + "learning_rate": 0.02050652045209097, + "loss": 0.0148, + "num_input_tokens_seen": 30095168, + "step": 33265 + }, + { + "epoch": 8.780783951431966, + "grad_norm": 0.0007246932364068925, + "learning_rate": 0.020476800656367672, + "loss": 0.0069, + "num_input_tokens_seen": 30099680, + "step": 33270 + }, + { + "epoch": 8.782103734987462, + "grad_norm": 0.0017825770191848278, + "learning_rate": 0.020447100834749425, + "loss": 0.0319, + "num_input_tokens_seen": 30104064, + "step": 33275 + }, + { + "epoch": 8.783423518542959, + "grad_norm": 0.0022528781555593014, + "learning_rate": 0.02041742099181627, + "loss": 0.0352, + "num_input_tokens_seen": 30108576, + "step": 33280 + }, + { + "epoch": 8.784743302098455, + "grad_norm": 0.0033799291122704744, + "learning_rate": 0.02038776113214526, + "loss": 0.0403, + "num_input_tokens_seen": 30113312, + "step": 33285 + }, + { + "epoch": 8.786063085653954, + "grad_norm": 0.004488206934183836, + "learning_rate": 0.0203581212603103, + "loss": 0.0145, + "num_input_tokens_seen": 30117824, + "step": 33290 + }, + { + "epoch": 8.78738286920945, + "grad_norm": 0.0008114763186313212, + "learning_rate": 0.02032850138088219, + "loss": 0.0178, + "num_input_tokens_seen": 30122272, + "step": 33295 + }, + { + "epoch": 8.788702652764947, + "grad_norm": 0.0005373743479140103, + "learning_rate": 0.020298901498428754, + "loss": 0.0136, + "num_input_tokens_seen": 30126624, + "step": 33300 + }, + { + "epoch": 8.790022436320443, + "grad_norm": 0.0002855401544366032, + "learning_rate": 0.020269321617514595, + "loss": 0.0279, + "num_input_tokens_seen": 30130912, + "step": 33305 + }, + { + "epoch": 8.79134221987594, + "grad_norm": 0.0038888168055564165, + "learning_rate": 0.020239761742701343, + "loss": 0.0321, + "num_input_tokens_seen": 30135456, + "step": 33310 + }, + { + "epoch": 8.792662003431436, + "grad_norm": 0.0006087786750867963, + "learning_rate": 0.02021022187854754, + "loss": 0.0121, + "num_input_tokens_seen": 30139872, + "step": 33315 + }, + { + "epoch": 8.793981786986935, + "grad_norm": 0.00020582599972840399, + "learning_rate": 0.020180702029608522, + "loss": 0.0156, + "num_input_tokens_seen": 30144384, + "step": 33320 + }, + { + "epoch": 8.795301570542431, + "grad_norm": 0.00088780332589522, + "learning_rate": 0.020151202200436695, + "loss": 0.014, + "num_input_tokens_seen": 30148864, + "step": 33325 + }, + { + "epoch": 8.796621354097928, + "grad_norm": 0.0015813792124390602, + "learning_rate": 0.020121722395581226, + "loss": 0.0184, + "num_input_tokens_seen": 30153440, + "step": 33330 + }, + { + "epoch": 8.797941137653424, + "grad_norm": 0.00446562422439456, + "learning_rate": 0.020092262619588342, + "loss": 0.0144, + "num_input_tokens_seen": 30157728, + "step": 33335 + }, + { + "epoch": 8.799260921208921, + "grad_norm": 0.0015869222115725279, + "learning_rate": 0.02006282287700109, + "loss": 0.0113, + "num_input_tokens_seen": 30162048, + "step": 33340 + }, + { + "epoch": 8.80058070476442, + "grad_norm": 0.004411677364259958, + "learning_rate": 0.020033403172359427, + "loss": 0.0161, + "num_input_tokens_seen": 30166880, + "step": 33345 + }, + { + "epoch": 8.801900488319916, + "grad_norm": 0.0014797732001170516, + "learning_rate": 0.020004003510200284, + "loss": 0.0275, + "num_input_tokens_seen": 30171488, + "step": 33350 + }, + { + "epoch": 8.803220271875412, + "grad_norm": 0.0005591905792243779, + "learning_rate": 0.019974623895057407, + "loss": 0.0336, + "num_input_tokens_seen": 30176192, + "step": 33355 + }, + { + "epoch": 8.804540055430909, + "grad_norm": 0.003393767634406686, + "learning_rate": 0.019945264331461553, + "loss": 0.031, + "num_input_tokens_seen": 30180544, + "step": 33360 + }, + { + "epoch": 8.805859838986406, + "grad_norm": 0.002604442648589611, + "learning_rate": 0.019915924823940317, + "loss": 0.0162, + "num_input_tokens_seen": 30185024, + "step": 33365 + }, + { + "epoch": 8.807179622541904, + "grad_norm": 0.0010032159043475986, + "learning_rate": 0.01988660537701816, + "loss": 0.0185, + "num_input_tokens_seen": 30189664, + "step": 33370 + }, + { + "epoch": 8.8084994060974, + "grad_norm": 0.003131004748865962, + "learning_rate": 0.01985730599521659, + "loss": 0.0988, + "num_input_tokens_seen": 30193888, + "step": 33375 + }, + { + "epoch": 8.809819189652897, + "grad_norm": 0.0005529894842766225, + "learning_rate": 0.019828026683053918, + "loss": 0.0235, + "num_input_tokens_seen": 30198336, + "step": 33380 + }, + { + "epoch": 8.811138973208394, + "grad_norm": 0.0054586948826909065, + "learning_rate": 0.01979876744504535, + "loss": 0.0255, + "num_input_tokens_seen": 30203200, + "step": 33385 + }, + { + "epoch": 8.81245875676389, + "grad_norm": 0.001113108010031283, + "learning_rate": 0.019769528285703046, + "loss": 0.0145, + "num_input_tokens_seen": 30207840, + "step": 33390 + }, + { + "epoch": 8.813778540319387, + "grad_norm": 0.003579467535018921, + "learning_rate": 0.019740309209536098, + "loss": 0.0253, + "num_input_tokens_seen": 30212448, + "step": 33395 + }, + { + "epoch": 8.815098323874885, + "grad_norm": 0.003721013432368636, + "learning_rate": 0.019711110221050387, + "loss": 0.0088, + "num_input_tokens_seen": 30216896, + "step": 33400 + }, + { + "epoch": 8.815098323874885, + "eval_loss": 0.09928131103515625, + "eval_runtime": 75.8117, + "eval_samples_per_second": 88.839, + "eval_steps_per_second": 22.213, + "num_input_tokens_seen": 30216896, + "step": 33400 + }, + { + "epoch": 8.816418107430382, + "grad_norm": 0.004968307912349701, + "learning_rate": 0.019681931324748825, + "loss": 0.0452, + "num_input_tokens_seen": 30221344, + "step": 33405 + }, + { + "epoch": 8.817737890985878, + "grad_norm": 0.0016890099504962564, + "learning_rate": 0.019652772525131094, + "loss": 0.0275, + "num_input_tokens_seen": 30225600, + "step": 33410 + }, + { + "epoch": 8.819057674541375, + "grad_norm": 0.000835006998386234, + "learning_rate": 0.019623633826693885, + "loss": 0.0191, + "num_input_tokens_seen": 30230112, + "step": 33415 + }, + { + "epoch": 8.820377458096871, + "grad_norm": 0.0047614541836082935, + "learning_rate": 0.019594515233930788, + "loss": 0.0367, + "num_input_tokens_seen": 30234752, + "step": 33420 + }, + { + "epoch": 8.82169724165237, + "grad_norm": 0.0028927968814969063, + "learning_rate": 0.019565416751332186, + "loss": 0.0201, + "num_input_tokens_seen": 30238976, + "step": 33425 + }, + { + "epoch": 8.823017025207866, + "grad_norm": 0.001167802605777979, + "learning_rate": 0.019536338383385497, + "loss": 0.0276, + "num_input_tokens_seen": 30243328, + "step": 33430 + }, + { + "epoch": 8.824336808763363, + "grad_norm": 0.0009749136515893042, + "learning_rate": 0.019507280134574933, + "loss": 0.0195, + "num_input_tokens_seen": 30248064, + "step": 33435 + }, + { + "epoch": 8.82565659231886, + "grad_norm": 0.0014106082962825894, + "learning_rate": 0.019478242009381624, + "loss": 0.009, + "num_input_tokens_seen": 30252800, + "step": 33440 + }, + { + "epoch": 8.826976375874356, + "grad_norm": 0.0003862427838612348, + "learning_rate": 0.01944922401228367, + "loss": 0.0204, + "num_input_tokens_seen": 30257664, + "step": 33445 + }, + { + "epoch": 8.828296159429854, + "grad_norm": 0.0011513722129166126, + "learning_rate": 0.01942022614775593, + "loss": 0.0153, + "num_input_tokens_seen": 30262432, + "step": 33450 + }, + { + "epoch": 8.82961594298535, + "grad_norm": 0.00017152645159512758, + "learning_rate": 0.01939124842027029, + "loss": 0.0199, + "num_input_tokens_seen": 30267008, + "step": 33455 + }, + { + "epoch": 8.830935726540847, + "grad_norm": 0.0013488342519849539, + "learning_rate": 0.01936229083429551, + "loss": 0.0278, + "num_input_tokens_seen": 30271392, + "step": 33460 + }, + { + "epoch": 8.832255510096344, + "grad_norm": 0.0020035270135849714, + "learning_rate": 0.019333353394297148, + "loss": 0.0252, + "num_input_tokens_seen": 30275840, + "step": 33465 + }, + { + "epoch": 8.83357529365184, + "grad_norm": 0.00048446148866787553, + "learning_rate": 0.019304436104737754, + "loss": 0.0196, + "num_input_tokens_seen": 30280096, + "step": 33470 + }, + { + "epoch": 8.834895077207339, + "grad_norm": 0.0009701112285256386, + "learning_rate": 0.019275538970076778, + "loss": 0.0155, + "num_input_tokens_seen": 30284352, + "step": 33475 + }, + { + "epoch": 8.836214860762835, + "grad_norm": 0.003915614914149046, + "learning_rate": 0.019246661994770434, + "loss": 0.0093, + "num_input_tokens_seen": 30288512, + "step": 33480 + }, + { + "epoch": 8.837534644318332, + "grad_norm": 0.0016452838899567723, + "learning_rate": 0.019217805183271985, + "loss": 0.018, + "num_input_tokens_seen": 30292640, + "step": 33485 + }, + { + "epoch": 8.838854427873828, + "grad_norm": 0.00720234727486968, + "learning_rate": 0.019188968540031465, + "loss": 0.0603, + "num_input_tokens_seen": 30296832, + "step": 33490 + }, + { + "epoch": 8.840174211429325, + "grad_norm": 0.003924037795513868, + "learning_rate": 0.019160152069495867, + "loss": 0.0431, + "num_input_tokens_seen": 30301344, + "step": 33495 + }, + { + "epoch": 8.841493994984823, + "grad_norm": 0.002952153328806162, + "learning_rate": 0.019131355776109103, + "loss": 0.0148, + "num_input_tokens_seen": 30306304, + "step": 33500 + }, + { + "epoch": 8.84281377854032, + "grad_norm": 0.004598728381097317, + "learning_rate": 0.019102579664311857, + "loss": 0.0517, + "num_input_tokens_seen": 30310720, + "step": 33505 + }, + { + "epoch": 8.844133562095816, + "grad_norm": 0.0004017157480120659, + "learning_rate": 0.019073823738541763, + "loss": 0.0068, + "num_input_tokens_seen": 30315264, + "step": 33510 + }, + { + "epoch": 8.845453345651313, + "grad_norm": 0.00162198964972049, + "learning_rate": 0.0190450880032334, + "loss": 0.04, + "num_input_tokens_seen": 30319744, + "step": 33515 + }, + { + "epoch": 8.84677312920681, + "grad_norm": 0.0031987298280000687, + "learning_rate": 0.019016372462818114, + "loss": 0.0206, + "num_input_tokens_seen": 30324128, + "step": 33520 + }, + { + "epoch": 8.848092912762308, + "grad_norm": 0.0013884580694139004, + "learning_rate": 0.018987677121724278, + "loss": 0.0138, + "num_input_tokens_seen": 30329184, + "step": 33525 + }, + { + "epoch": 8.849412696317804, + "grad_norm": 0.0019547082483768463, + "learning_rate": 0.018959001984377, + "loss": 0.0247, + "num_input_tokens_seen": 30333760, + "step": 33530 + }, + { + "epoch": 8.850732479873301, + "grad_norm": 0.002746395068243146, + "learning_rate": 0.018930347055198377, + "loss": 0.0213, + "num_input_tokens_seen": 30338016, + "step": 33535 + }, + { + "epoch": 8.852052263428797, + "grad_norm": 0.0003104577772319317, + "learning_rate": 0.01890171233860739, + "loss": 0.0315, + "num_input_tokens_seen": 30342560, + "step": 33540 + }, + { + "epoch": 8.853372046984294, + "grad_norm": 0.0015097034629434347, + "learning_rate": 0.018873097839019807, + "loss": 0.0373, + "num_input_tokens_seen": 30347296, + "step": 33545 + }, + { + "epoch": 8.854691830539792, + "grad_norm": 0.0004650457703974098, + "learning_rate": 0.0188445035608484, + "loss": 0.0236, + "num_input_tokens_seen": 30351776, + "step": 33550 + }, + { + "epoch": 8.856011614095289, + "grad_norm": 0.00021452787041198462, + "learning_rate": 0.018815929508502777, + "loss": 0.0361, + "num_input_tokens_seen": 30355968, + "step": 33555 + }, + { + "epoch": 8.857331397650785, + "grad_norm": 0.0005208867369219661, + "learning_rate": 0.01878737568638934, + "loss": 0.0092, + "num_input_tokens_seen": 30360448, + "step": 33560 + }, + { + "epoch": 8.858651181206282, + "grad_norm": 0.0011056015500798821, + "learning_rate": 0.01875884209891152, + "loss": 0.0092, + "num_input_tokens_seen": 30364832, + "step": 33565 + }, + { + "epoch": 8.859970964761779, + "grad_norm": 0.001227535423822701, + "learning_rate": 0.018730328750469514, + "loss": 0.0169, + "num_input_tokens_seen": 30369536, + "step": 33570 + }, + { + "epoch": 8.861290748317275, + "grad_norm": 0.000291031930828467, + "learning_rate": 0.018701835645460473, + "loss": 0.0198, + "num_input_tokens_seen": 30374272, + "step": 33575 + }, + { + "epoch": 8.862610531872773, + "grad_norm": 0.0016448143869638443, + "learning_rate": 0.01867336278827838, + "loss": 0.0203, + "num_input_tokens_seen": 30378688, + "step": 33580 + }, + { + "epoch": 8.86393031542827, + "grad_norm": 0.001959532964974642, + "learning_rate": 0.018644910183314056, + "loss": 0.0332, + "num_input_tokens_seen": 30383296, + "step": 33585 + }, + { + "epoch": 8.865250098983767, + "grad_norm": 0.0011246484937146306, + "learning_rate": 0.01861647783495531, + "loss": 0.0159, + "num_input_tokens_seen": 30387936, + "step": 33590 + }, + { + "epoch": 8.866569882539263, + "grad_norm": 0.0012039266293868423, + "learning_rate": 0.01858806574758676, + "loss": 0.0169, + "num_input_tokens_seen": 30392384, + "step": 33595 + }, + { + "epoch": 8.86788966609476, + "grad_norm": 0.00159428920596838, + "learning_rate": 0.01855967392558988, + "loss": 0.0306, + "num_input_tokens_seen": 30396960, + "step": 33600 + }, + { + "epoch": 8.86788966609476, + "eval_loss": 0.10097994655370712, + "eval_runtime": 75.8653, + "eval_samples_per_second": 88.776, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 30396960, + "step": 33600 + }, + { + "epoch": 8.869209449650258, + "grad_norm": 0.001665836782194674, + "learning_rate": 0.018531302373343096, + "loss": 0.0085, + "num_input_tokens_seen": 30401312, + "step": 33605 + }, + { + "epoch": 8.870529233205755, + "grad_norm": 0.0019384130137041211, + "learning_rate": 0.018502951095221588, + "loss": 0.0201, + "num_input_tokens_seen": 30405728, + "step": 33610 + }, + { + "epoch": 8.871849016761251, + "grad_norm": 0.002155065769329667, + "learning_rate": 0.01847462009559751, + "loss": 0.0535, + "num_input_tokens_seen": 30410208, + "step": 33615 + }, + { + "epoch": 8.873168800316748, + "grad_norm": 0.0038871928118169308, + "learning_rate": 0.01844630937883992, + "loss": 0.0494, + "num_input_tokens_seen": 30414624, + "step": 33620 + }, + { + "epoch": 8.874488583872244, + "grad_norm": 0.0008440698729828, + "learning_rate": 0.018418018949314573, + "loss": 0.014, + "num_input_tokens_seen": 30419072, + "step": 33625 + }, + { + "epoch": 8.875808367427743, + "grad_norm": 0.0014562037540599704, + "learning_rate": 0.018389748811384315, + "loss": 0.0261, + "num_input_tokens_seen": 30423584, + "step": 33630 + }, + { + "epoch": 8.87712815098324, + "grad_norm": 0.0024776689242571592, + "learning_rate": 0.018361498969408658, + "loss": 0.0259, + "num_input_tokens_seen": 30427712, + "step": 33635 + }, + { + "epoch": 8.878447934538736, + "grad_norm": 0.005839281249791384, + "learning_rate": 0.01833326942774415, + "loss": 0.0289, + "num_input_tokens_seen": 30432192, + "step": 33640 + }, + { + "epoch": 8.879767718094232, + "grad_norm": 0.0002599732833914459, + "learning_rate": 0.018305060190744155, + "loss": 0.0098, + "num_input_tokens_seen": 30436480, + "step": 33645 + }, + { + "epoch": 8.881087501649729, + "grad_norm": 0.005658195819705725, + "learning_rate": 0.018276871262758846, + "loss": 0.0454, + "num_input_tokens_seen": 30440800, + "step": 33650 + }, + { + "epoch": 8.882407285205227, + "grad_norm": 0.004128952044993639, + "learning_rate": 0.0182487026481353, + "loss": 0.0323, + "num_input_tokens_seen": 30445056, + "step": 33655 + }, + { + "epoch": 8.883727068760724, + "grad_norm": 0.0015373299829661846, + "learning_rate": 0.018220554351217538, + "loss": 0.0354, + "num_input_tokens_seen": 30449440, + "step": 33660 + }, + { + "epoch": 8.88504685231622, + "grad_norm": 0.0009668247075751424, + "learning_rate": 0.01819242637634629, + "loss": 0.0643, + "num_input_tokens_seen": 30454016, + "step": 33665 + }, + { + "epoch": 8.886366635871717, + "grad_norm": 0.003121131332591176, + "learning_rate": 0.01816431872785933, + "loss": 0.0122, + "num_input_tokens_seen": 30458880, + "step": 33670 + }, + { + "epoch": 8.887686419427213, + "grad_norm": 0.0033118987921625376, + "learning_rate": 0.018136231410091148, + "loss": 0.0201, + "num_input_tokens_seen": 30463360, + "step": 33675 + }, + { + "epoch": 8.88900620298271, + "grad_norm": 0.0003784664731938392, + "learning_rate": 0.018108164427373175, + "loss": 0.0117, + "num_input_tokens_seen": 30467616, + "step": 33680 + }, + { + "epoch": 8.890325986538208, + "grad_norm": 0.002725557889789343, + "learning_rate": 0.01808011778403375, + "loss": 0.0121, + "num_input_tokens_seen": 30471872, + "step": 33685 + }, + { + "epoch": 8.891645770093705, + "grad_norm": 0.00028278876561671495, + "learning_rate": 0.01805209148439793, + "loss": 0.0092, + "num_input_tokens_seen": 30476672, + "step": 33690 + }, + { + "epoch": 8.892965553649201, + "grad_norm": 0.00018713339522946626, + "learning_rate": 0.018024085532787757, + "loss": 0.0038, + "num_input_tokens_seen": 30481024, + "step": 33695 + }, + { + "epoch": 8.894285337204698, + "grad_norm": 0.001213845331221819, + "learning_rate": 0.017996099933522164, + "loss": 0.0259, + "num_input_tokens_seen": 30485312, + "step": 33700 + }, + { + "epoch": 8.895605120760194, + "grad_norm": 0.0031204011756926775, + "learning_rate": 0.017968134690916775, + "loss": 0.0185, + "num_input_tokens_seen": 30489600, + "step": 33705 + }, + { + "epoch": 8.896924904315693, + "grad_norm": 0.003683362854644656, + "learning_rate": 0.017940189809284263, + "loss": 0.0434, + "num_input_tokens_seen": 30494048, + "step": 33710 + }, + { + "epoch": 8.89824468787119, + "grad_norm": 0.004639550112187862, + "learning_rate": 0.017912265292934024, + "loss": 0.0431, + "num_input_tokens_seen": 30498656, + "step": 33715 + }, + { + "epoch": 8.899564471426686, + "grad_norm": 0.002104710089042783, + "learning_rate": 0.017884361146172423, + "loss": 0.0171, + "num_input_tokens_seen": 30503264, + "step": 33720 + }, + { + "epoch": 8.900884254982182, + "grad_norm": 0.0016959805507212877, + "learning_rate": 0.01785647737330261, + "loss": 0.0111, + "num_input_tokens_seen": 30507936, + "step": 33725 + }, + { + "epoch": 8.902204038537679, + "grad_norm": 0.004113081842660904, + "learning_rate": 0.017828613978624563, + "loss": 0.0414, + "num_input_tokens_seen": 30512736, + "step": 33730 + }, + { + "epoch": 8.903523822093177, + "grad_norm": 0.0017653262475505471, + "learning_rate": 0.01780077096643523, + "loss": 0.0315, + "num_input_tokens_seen": 30517440, + "step": 33735 + }, + { + "epoch": 8.904843605648674, + "grad_norm": 0.004712948575615883, + "learning_rate": 0.017772948341028345, + "loss": 0.0294, + "num_input_tokens_seen": 30522016, + "step": 33740 + }, + { + "epoch": 8.90616338920417, + "grad_norm": 0.0022486289963126183, + "learning_rate": 0.01774514610669447, + "loss": 0.0321, + "num_input_tokens_seen": 30526656, + "step": 33745 + }, + { + "epoch": 8.907483172759667, + "grad_norm": 0.0004066124092787504, + "learning_rate": 0.017717364267721112, + "loss": 0.0153, + "num_input_tokens_seen": 30531296, + "step": 33750 + }, + { + "epoch": 8.908802956315164, + "grad_norm": 0.003261430189013481, + "learning_rate": 0.017689602828392513, + "loss": 0.0245, + "num_input_tokens_seen": 30536000, + "step": 33755 + }, + { + "epoch": 8.910122739870662, + "grad_norm": 0.00306550320237875, + "learning_rate": 0.017661861792989897, + "loss": 0.0202, + "num_input_tokens_seen": 30540512, + "step": 33760 + }, + { + "epoch": 8.911442523426159, + "grad_norm": 0.0019703444559127092, + "learning_rate": 0.017634141165791272, + "loss": 0.0256, + "num_input_tokens_seen": 30544928, + "step": 33765 + }, + { + "epoch": 8.912762306981655, + "grad_norm": 0.00366991083137691, + "learning_rate": 0.017606440951071455, + "loss": 0.035, + "num_input_tokens_seen": 30549568, + "step": 33770 + }, + { + "epoch": 8.914082090537152, + "grad_norm": 0.0012588363606482744, + "learning_rate": 0.017578761153102213, + "loss": 0.0338, + "num_input_tokens_seen": 30554048, + "step": 33775 + }, + { + "epoch": 8.915401874092648, + "grad_norm": 0.0003320384712424129, + "learning_rate": 0.017551101776152146, + "loss": 0.0348, + "num_input_tokens_seen": 30558784, + "step": 33780 + }, + { + "epoch": 8.916721657648147, + "grad_norm": 0.000906769186258316, + "learning_rate": 0.017523462824486608, + "loss": 0.0298, + "num_input_tokens_seen": 30563456, + "step": 33785 + }, + { + "epoch": 8.918041441203643, + "grad_norm": 0.0010178614174947143, + "learning_rate": 0.01749584430236794, + "loss": 0.0276, + "num_input_tokens_seen": 30568224, + "step": 33790 + }, + { + "epoch": 8.91936122475914, + "grad_norm": 0.0012388717150315642, + "learning_rate": 0.01746824621405524, + "loss": 0.023, + "num_input_tokens_seen": 30572544, + "step": 33795 + }, + { + "epoch": 8.920681008314636, + "grad_norm": 0.0017207140335813165, + "learning_rate": 0.017440668563804412, + "loss": 0.0189, + "num_input_tokens_seen": 30576960, + "step": 33800 + }, + { + "epoch": 8.920681008314636, + "eval_loss": 0.10127760469913483, + "eval_runtime": 75.8785, + "eval_samples_per_second": 88.76, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 30576960, + "step": 33800 + }, + { + "epoch": 8.922000791870133, + "grad_norm": 0.0005471634794957936, + "learning_rate": 0.017413111355868392, + "loss": 0.0116, + "num_input_tokens_seen": 30581344, + "step": 33805 + }, + { + "epoch": 8.923320575425631, + "grad_norm": 0.005048595834523439, + "learning_rate": 0.017385574594496748, + "loss": 0.0198, + "num_input_tokens_seen": 30586144, + "step": 33810 + }, + { + "epoch": 8.924640358981128, + "grad_norm": 0.003784047206863761, + "learning_rate": 0.01735805828393605, + "loss": 0.0331, + "num_input_tokens_seen": 30590720, + "step": 33815 + }, + { + "epoch": 8.925960142536624, + "grad_norm": 0.00278626405633986, + "learning_rate": 0.017330562428429667, + "loss": 0.0414, + "num_input_tokens_seen": 30595264, + "step": 33820 + }, + { + "epoch": 8.92727992609212, + "grad_norm": 0.0027536582201719284, + "learning_rate": 0.01730308703221776, + "loss": 0.0401, + "num_input_tokens_seen": 30599776, + "step": 33825 + }, + { + "epoch": 8.928599709647617, + "grad_norm": 0.004082702565938234, + "learning_rate": 0.01727563209953744, + "loss": 0.0127, + "num_input_tokens_seen": 30604480, + "step": 33830 + }, + { + "epoch": 8.929919493203116, + "grad_norm": 0.002086843363940716, + "learning_rate": 0.017248197634622535, + "loss": 0.0391, + "num_input_tokens_seen": 30609056, + "step": 33835 + }, + { + "epoch": 8.931239276758612, + "grad_norm": 0.00191531574819237, + "learning_rate": 0.01722078364170383, + "loss": 0.0202, + "num_input_tokens_seen": 30613728, + "step": 33840 + }, + { + "epoch": 8.932559060314109, + "grad_norm": 0.001797012286260724, + "learning_rate": 0.017193390125008905, + "loss": 0.0277, + "num_input_tokens_seen": 30618272, + "step": 33845 + }, + { + "epoch": 8.933878843869605, + "grad_norm": 0.002313294680789113, + "learning_rate": 0.017166017088762153, + "loss": 0.017, + "num_input_tokens_seen": 30622656, + "step": 33850 + }, + { + "epoch": 8.935198627425102, + "grad_norm": 0.0003465784539002925, + "learning_rate": 0.017138664537184878, + "loss": 0.0241, + "num_input_tokens_seen": 30627200, + "step": 33855 + }, + { + "epoch": 8.936518410980598, + "grad_norm": 0.0013536810874938965, + "learning_rate": 0.017111332474495172, + "loss": 0.044, + "num_input_tokens_seen": 30631776, + "step": 33860 + }, + { + "epoch": 8.937838194536097, + "grad_norm": 0.00538845919072628, + "learning_rate": 0.017084020904907998, + "loss": 0.0306, + "num_input_tokens_seen": 30636544, + "step": 33865 + }, + { + "epoch": 8.939157978091593, + "grad_norm": 0.0011986973695456982, + "learning_rate": 0.017056729832635103, + "loss": 0.0236, + "num_input_tokens_seen": 30641152, + "step": 33870 + }, + { + "epoch": 8.94047776164709, + "grad_norm": 0.0007881926139816642, + "learning_rate": 0.017029459261885153, + "loss": 0.0263, + "num_input_tokens_seen": 30645824, + "step": 33875 + }, + { + "epoch": 8.941797545202586, + "grad_norm": 0.0004072538285981864, + "learning_rate": 0.01700220919686359, + "loss": 0.0495, + "num_input_tokens_seen": 30650368, + "step": 33880 + }, + { + "epoch": 8.943117328758083, + "grad_norm": 0.00039344170363619924, + "learning_rate": 0.016974979641772723, + "loss": 0.0081, + "num_input_tokens_seen": 30654496, + "step": 33885 + }, + { + "epoch": 8.944437112313581, + "grad_norm": 0.002861074637621641, + "learning_rate": 0.01694777060081169, + "loss": 0.0215, + "num_input_tokens_seen": 30659040, + "step": 33890 + }, + { + "epoch": 8.945756895869078, + "grad_norm": 0.0032978542149066925, + "learning_rate": 0.016920582078176444, + "loss": 0.0377, + "num_input_tokens_seen": 30663232, + "step": 33895 + }, + { + "epoch": 8.947076679424574, + "grad_norm": 0.0009081665193662047, + "learning_rate": 0.016893414078059863, + "loss": 0.0077, + "num_input_tokens_seen": 30667744, + "step": 33900 + }, + { + "epoch": 8.948396462980071, + "grad_norm": 0.0007339133298955858, + "learning_rate": 0.016866266604651535, + "loss": 0.0298, + "num_input_tokens_seen": 30672384, + "step": 33905 + }, + { + "epoch": 8.949716246535568, + "grad_norm": 0.003489135066047311, + "learning_rate": 0.016839139662137976, + "loss": 0.0348, + "num_input_tokens_seen": 30677056, + "step": 33910 + }, + { + "epoch": 8.951036030091066, + "grad_norm": 0.0016013154527172446, + "learning_rate": 0.01681203325470245, + "loss": 0.0066, + "num_input_tokens_seen": 30681472, + "step": 33915 + }, + { + "epoch": 8.952355813646562, + "grad_norm": 0.0020116199739277363, + "learning_rate": 0.016784947386525157, + "loss": 0.0108, + "num_input_tokens_seen": 30686272, + "step": 33920 + }, + { + "epoch": 8.953675597202059, + "grad_norm": 0.005687739234417677, + "learning_rate": 0.01675788206178308, + "loss": 0.0186, + "num_input_tokens_seen": 30690592, + "step": 33925 + }, + { + "epoch": 8.954995380757556, + "grad_norm": 0.0005168596981093287, + "learning_rate": 0.016730837284649986, + "loss": 0.0271, + "num_input_tokens_seen": 30695104, + "step": 33930 + }, + { + "epoch": 8.956315164313052, + "grad_norm": 0.004312640056014061, + "learning_rate": 0.016703813059296583, + "loss": 0.0386, + "num_input_tokens_seen": 30699744, + "step": 33935 + }, + { + "epoch": 8.957634947868549, + "grad_norm": 0.004973187111318111, + "learning_rate": 0.016676809389890294, + "loss": 0.0496, + "num_input_tokens_seen": 30704352, + "step": 33940 + }, + { + "epoch": 8.958954731424047, + "grad_norm": 0.0030424545984715223, + "learning_rate": 0.016649826280595435, + "loss": 0.0233, + "num_input_tokens_seen": 30708800, + "step": 33945 + }, + { + "epoch": 8.960274514979544, + "grad_norm": 0.004586965311318636, + "learning_rate": 0.016622863735573163, + "loss": 0.0568, + "num_input_tokens_seen": 30713600, + "step": 33950 + }, + { + "epoch": 8.96159429853504, + "grad_norm": 0.0020460253581404686, + "learning_rate": 0.016595921758981395, + "loss": 0.0121, + "num_input_tokens_seen": 30718112, + "step": 33955 + }, + { + "epoch": 8.962914082090537, + "grad_norm": 0.001077024731785059, + "learning_rate": 0.01656900035497495, + "loss": 0.0262, + "num_input_tokens_seen": 30723040, + "step": 33960 + }, + { + "epoch": 8.964233865646033, + "grad_norm": 0.0019779051654040813, + "learning_rate": 0.016542099527705485, + "loss": 0.0104, + "num_input_tokens_seen": 30727712, + "step": 33965 + }, + { + "epoch": 8.965553649201532, + "grad_norm": 0.0002923339488916099, + "learning_rate": 0.01651521928132138, + "loss": 0.0268, + "num_input_tokens_seen": 30732288, + "step": 33970 + }, + { + "epoch": 8.966873432757028, + "grad_norm": 0.0015665678074583411, + "learning_rate": 0.01648835961996794, + "loss": 0.0271, + "num_input_tokens_seen": 30736832, + "step": 33975 + }, + { + "epoch": 8.968193216312525, + "grad_norm": 0.0015021952567622066, + "learning_rate": 0.016461520547787285, + "loss": 0.0092, + "num_input_tokens_seen": 30741440, + "step": 33980 + }, + { + "epoch": 8.969512999868021, + "grad_norm": 0.0013511432334780693, + "learning_rate": 0.016434702068918266, + "loss": 0.0332, + "num_input_tokens_seen": 30745952, + "step": 33985 + }, + { + "epoch": 8.970832783423518, + "grad_norm": 0.0001734997786115855, + "learning_rate": 0.01640790418749673, + "loss": 0.0307, + "num_input_tokens_seen": 30750496, + "step": 33990 + }, + { + "epoch": 8.972152566979016, + "grad_norm": 0.0001513993483968079, + "learning_rate": 0.016381126907655134, + "loss": 0.0076, + "num_input_tokens_seen": 30755072, + "step": 33995 + }, + { + "epoch": 8.973472350534513, + "grad_norm": 0.0006940297898836434, + "learning_rate": 0.016354370233522948, + "loss": 0.0232, + "num_input_tokens_seen": 30759200, + "step": 34000 + }, + { + "epoch": 8.973472350534513, + "eval_loss": 0.10043871402740479, + "eval_runtime": 75.9177, + "eval_samples_per_second": 88.714, + "eval_steps_per_second": 22.182, + "num_input_tokens_seen": 30759200, + "step": 34000 + }, + { + "epoch": 8.97479213409001, + "grad_norm": 0.0005209074588492513, + "learning_rate": 0.016327634169226394, + "loss": 0.0156, + "num_input_tokens_seen": 30763808, + "step": 34005 + }, + { + "epoch": 8.976111917645506, + "grad_norm": 0.007982178591191769, + "learning_rate": 0.016300918718888485, + "loss": 0.0309, + "num_input_tokens_seen": 30768448, + "step": 34010 + }, + { + "epoch": 8.977431701201002, + "grad_norm": 0.0012366431765258312, + "learning_rate": 0.016274223886629052, + "loss": 0.054, + "num_input_tokens_seen": 30773056, + "step": 34015 + }, + { + "epoch": 8.9787514847565, + "grad_norm": 0.005075027700513601, + "learning_rate": 0.01624754967656482, + "loss": 0.0327, + "num_input_tokens_seen": 30777280, + "step": 34020 + }, + { + "epoch": 8.980071268311997, + "grad_norm": 0.0013004039647057652, + "learning_rate": 0.016220896092809235, + "loss": 0.0318, + "num_input_tokens_seen": 30781664, + "step": 34025 + }, + { + "epoch": 8.981391051867494, + "grad_norm": 0.00038976268842816353, + "learning_rate": 0.01619426313947267, + "loss": 0.0142, + "num_input_tokens_seen": 30785728, + "step": 34030 + }, + { + "epoch": 8.98271083542299, + "grad_norm": 0.001257851137779653, + "learning_rate": 0.016167650820662228, + "loss": 0.0275, + "num_input_tokens_seen": 30790112, + "step": 34035 + }, + { + "epoch": 8.984030618978487, + "grad_norm": 0.0019238363020122051, + "learning_rate": 0.016141059140481855, + "loss": 0.009, + "num_input_tokens_seen": 30794400, + "step": 34040 + }, + { + "epoch": 8.985350402533985, + "grad_norm": 0.00014265273057390004, + "learning_rate": 0.016114488103032374, + "loss": 0.0084, + "num_input_tokens_seen": 30799104, + "step": 34045 + }, + { + "epoch": 8.986670186089482, + "grad_norm": 0.001211281749419868, + "learning_rate": 0.016087937712411293, + "loss": 0.0208, + "num_input_tokens_seen": 30803712, + "step": 34050 + }, + { + "epoch": 8.987989969644978, + "grad_norm": 0.0003369374026078731, + "learning_rate": 0.01606140797271308, + "loss": 0.0273, + "num_input_tokens_seen": 30807840, + "step": 34055 + }, + { + "epoch": 8.989309753200475, + "grad_norm": 0.0006956777069717646, + "learning_rate": 0.01603489888802897, + "loss": 0.0286, + "num_input_tokens_seen": 30812224, + "step": 34060 + }, + { + "epoch": 8.990629536755971, + "grad_norm": 0.00031527108512818813, + "learning_rate": 0.016008410462446918, + "loss": 0.0491, + "num_input_tokens_seen": 30816608, + "step": 34065 + }, + { + "epoch": 8.99194932031147, + "grad_norm": 0.00017847628623712808, + "learning_rate": 0.01598194270005185, + "loss": 0.0066, + "num_input_tokens_seen": 30821184, + "step": 34070 + }, + { + "epoch": 8.993269103866966, + "grad_norm": 0.0004973767790943384, + "learning_rate": 0.015955495604925356, + "loss": 0.0081, + "num_input_tokens_seen": 30825568, + "step": 34075 + }, + { + "epoch": 8.994588887422463, + "grad_norm": 0.00011996759712928906, + "learning_rate": 0.01592906918114598, + "loss": 0.0128, + "num_input_tokens_seen": 30830240, + "step": 34080 + }, + { + "epoch": 8.99590867097796, + "grad_norm": 0.002286567585542798, + "learning_rate": 0.015902663432788965, + "loss": 0.0218, + "num_input_tokens_seen": 30834784, + "step": 34085 + }, + { + "epoch": 8.997228454533456, + "grad_norm": 0.0006301800603978336, + "learning_rate": 0.01587627836392643, + "loss": 0.0261, + "num_input_tokens_seen": 30839648, + "step": 34090 + }, + { + "epoch": 8.998548238088954, + "grad_norm": 0.0006869210046716034, + "learning_rate": 0.01584991397862726, + "loss": 0.0383, + "num_input_tokens_seen": 30844256, + "step": 34095 + }, + { + "epoch": 8.99986802164445, + "grad_norm": 0.002484502736479044, + "learning_rate": 0.015823570280957214, + "loss": 0.0229, + "num_input_tokens_seen": 30848704, + "step": 34100 + }, + { + "epoch": 9.001055826844398, + "grad_norm": 0.0006977661396376789, + "learning_rate": 0.015797247274978766, + "loss": 0.0289, + "num_input_tokens_seen": 30852960, + "step": 34105 + }, + { + "epoch": 9.002375610399895, + "grad_norm": 0.00036026843008585274, + "learning_rate": 0.015770944964751326, + "loss": 0.0339, + "num_input_tokens_seen": 30857248, + "step": 34110 + }, + { + "epoch": 9.003695393955391, + "grad_norm": 0.002174572553485632, + "learning_rate": 0.015744663354330956, + "loss": 0.0176, + "num_input_tokens_seen": 30861504, + "step": 34115 + }, + { + "epoch": 9.005015177510888, + "grad_norm": 0.0005922582349739969, + "learning_rate": 0.015718402447770664, + "loss": 0.0108, + "num_input_tokens_seen": 30866176, + "step": 34120 + }, + { + "epoch": 9.006334961066385, + "grad_norm": 0.0006311554461717606, + "learning_rate": 0.015692162249120224, + "loss": 0.0136, + "num_input_tokens_seen": 30870624, + "step": 34125 + }, + { + "epoch": 9.007654744621883, + "grad_norm": 0.0007433770224452019, + "learning_rate": 0.01566594276242615, + "loss": 0.0104, + "num_input_tokens_seen": 30875552, + "step": 34130 + }, + { + "epoch": 9.00897452817738, + "grad_norm": 0.0020649658981710672, + "learning_rate": 0.015639743991731857, + "loss": 0.0119, + "num_input_tokens_seen": 30880256, + "step": 34135 + }, + { + "epoch": 9.010294311732876, + "grad_norm": 0.0006152098649181426, + "learning_rate": 0.01561356594107755, + "loss": 0.0244, + "num_input_tokens_seen": 30884448, + "step": 34140 + }, + { + "epoch": 9.011614095288373, + "grad_norm": 0.003802219172939658, + "learning_rate": 0.015587408614500147, + "loss": 0.0098, + "num_input_tokens_seen": 30888928, + "step": 34145 + }, + { + "epoch": 9.012933878843869, + "grad_norm": 0.0007259786361828446, + "learning_rate": 0.015561272016033505, + "loss": 0.0117, + "num_input_tokens_seen": 30893536, + "step": 34150 + }, + { + "epoch": 9.014253662399366, + "grad_norm": 0.00414647813886404, + "learning_rate": 0.015535156149708167, + "loss": 0.0172, + "num_input_tokens_seen": 30898208, + "step": 34155 + }, + { + "epoch": 9.015573445954864, + "grad_norm": 0.0019407771760597825, + "learning_rate": 0.015509061019551528, + "loss": 0.0209, + "num_input_tokens_seen": 30903168, + "step": 34160 + }, + { + "epoch": 9.01689322951036, + "grad_norm": 0.005043444689363241, + "learning_rate": 0.015482986629587818, + "loss": 0.0248, + "num_input_tokens_seen": 30907680, + "step": 34165 + }, + { + "epoch": 9.018213013065857, + "grad_norm": 0.00018630563863553107, + "learning_rate": 0.01545693298383799, + "loss": 0.014, + "num_input_tokens_seen": 30912288, + "step": 34170 + }, + { + "epoch": 9.019532796621354, + "grad_norm": 0.0011430502636358142, + "learning_rate": 0.015430900086319858, + "loss": 0.0247, + "num_input_tokens_seen": 30916640, + "step": 34175 + }, + { + "epoch": 9.02085258017685, + "grad_norm": 0.0008276999578811228, + "learning_rate": 0.015404887941048084, + "loss": 0.016, + "num_input_tokens_seen": 30921120, + "step": 34180 + }, + { + "epoch": 9.022172363732349, + "grad_norm": 0.002352982060983777, + "learning_rate": 0.01537889655203397, + "loss": 0.0087, + "num_input_tokens_seen": 30925536, + "step": 34185 + }, + { + "epoch": 9.023492147287845, + "grad_norm": 0.0004608544986695051, + "learning_rate": 0.015352925923285798, + "loss": 0.0148, + "num_input_tokens_seen": 30930080, + "step": 34190 + }, + { + "epoch": 9.024811930843342, + "grad_norm": 0.0014576666289940476, + "learning_rate": 0.015326976058808511, + "loss": 0.0318, + "num_input_tokens_seen": 30934336, + "step": 34195 + }, + { + "epoch": 9.026131714398838, + "grad_norm": 0.0003447327471803874, + "learning_rate": 0.015301046962603908, + "loss": 0.0119, + "num_input_tokens_seen": 30938880, + "step": 34200 + }, + { + "epoch": 9.026131714398838, + "eval_loss": 0.10300639271736145, + "eval_runtime": 76.0131, + "eval_samples_per_second": 88.603, + "eval_steps_per_second": 22.154, + "num_input_tokens_seen": 30938880, + "step": 34200 + }, + { + "epoch": 9.027451497954335, + "grad_norm": 0.0018318596994504333, + "learning_rate": 0.015275138638670626, + "loss": 0.0162, + "num_input_tokens_seen": 30943488, + "step": 34205 + }, + { + "epoch": 9.028771281509833, + "grad_norm": 0.0028962763026356697, + "learning_rate": 0.015249251091004001, + "loss": 0.0237, + "num_input_tokens_seen": 30948160, + "step": 34210 + }, + { + "epoch": 9.03009106506533, + "grad_norm": 0.0015935087576508522, + "learning_rate": 0.01522338432359624, + "loss": 0.0096, + "num_input_tokens_seen": 30952704, + "step": 34215 + }, + { + "epoch": 9.031410848620826, + "grad_norm": 0.0011771244462579489, + "learning_rate": 0.01519753834043635, + "loss": 0.0149, + "num_input_tokens_seen": 30957440, + "step": 34220 + }, + { + "epoch": 9.032730632176323, + "grad_norm": 0.0010522565571591258, + "learning_rate": 0.015171713145510095, + "loss": 0.0193, + "num_input_tokens_seen": 30962016, + "step": 34225 + }, + { + "epoch": 9.03405041573182, + "grad_norm": 0.00029833897133357823, + "learning_rate": 0.01514590874279999, + "loss": 0.0051, + "num_input_tokens_seen": 30966656, + "step": 34230 + }, + { + "epoch": 9.035370199287318, + "grad_norm": 0.0008488120511174202, + "learning_rate": 0.015120125136285467, + "loss": 0.0383, + "num_input_tokens_seen": 30971136, + "step": 34235 + }, + { + "epoch": 9.036689982842814, + "grad_norm": 0.0015685432590544224, + "learning_rate": 0.015094362329942629, + "loss": 0.0112, + "num_input_tokens_seen": 30975648, + "step": 34240 + }, + { + "epoch": 9.03800976639831, + "grad_norm": 0.0014540569391101599, + "learning_rate": 0.01506862032774448, + "loss": 0.0101, + "num_input_tokens_seen": 30980000, + "step": 34245 + }, + { + "epoch": 9.039329549953807, + "grad_norm": 0.0004361648461781442, + "learning_rate": 0.015042899133660697, + "loss": 0.013, + "num_input_tokens_seen": 30984512, + "step": 34250 + }, + { + "epoch": 9.040649333509304, + "grad_norm": 0.0030216595623642206, + "learning_rate": 0.01501719875165789, + "loss": 0.0272, + "num_input_tokens_seen": 30989056, + "step": 34255 + }, + { + "epoch": 9.041969117064802, + "grad_norm": 0.0016821780009195209, + "learning_rate": 0.014991519185699286, + "loss": 0.0177, + "num_input_tokens_seen": 30993280, + "step": 34260 + }, + { + "epoch": 9.043288900620299, + "grad_norm": 0.00037941435584798455, + "learning_rate": 0.014965860439745054, + "loss": 0.0061, + "num_input_tokens_seen": 30997824, + "step": 34265 + }, + { + "epoch": 9.044608684175795, + "grad_norm": 0.0005733160069212317, + "learning_rate": 0.01494022251775211, + "loss": 0.0113, + "num_input_tokens_seen": 31002080, + "step": 34270 + }, + { + "epoch": 9.045928467731292, + "grad_norm": 0.00039441155968233943, + "learning_rate": 0.014914605423674109, + "loss": 0.0059, + "num_input_tokens_seen": 31006688, + "step": 34275 + }, + { + "epoch": 9.047248251286788, + "grad_norm": 0.0008035302744247019, + "learning_rate": 0.014889009161461525, + "loss": 0.0066, + "num_input_tokens_seen": 31011104, + "step": 34280 + }, + { + "epoch": 9.048568034842285, + "grad_norm": 0.0005850319867022336, + "learning_rate": 0.014863433735061665, + "loss": 0.0173, + "num_input_tokens_seen": 31015360, + "step": 34285 + }, + { + "epoch": 9.049887818397783, + "grad_norm": 8.859234367264435e-05, + "learning_rate": 0.014837879148418541, + "loss": 0.0148, + "num_input_tokens_seen": 31019904, + "step": 34290 + }, + { + "epoch": 9.05120760195328, + "grad_norm": 0.0008593970560468733, + "learning_rate": 0.01481234540547302, + "loss": 0.0255, + "num_input_tokens_seen": 31024704, + "step": 34295 + }, + { + "epoch": 9.052527385508776, + "grad_norm": 0.0013854641001671553, + "learning_rate": 0.014786832510162717, + "loss": 0.0174, + "num_input_tokens_seen": 31029184, + "step": 34300 + }, + { + "epoch": 9.053847169064273, + "grad_norm": 0.0038361335173249245, + "learning_rate": 0.014761340466422017, + "loss": 0.0098, + "num_input_tokens_seen": 31033696, + "step": 34305 + }, + { + "epoch": 9.05516695261977, + "grad_norm": 0.0024526596534997225, + "learning_rate": 0.014735869278182144, + "loss": 0.0291, + "num_input_tokens_seen": 31038112, + "step": 34310 + }, + { + "epoch": 9.056486736175268, + "grad_norm": 0.0012869815109297633, + "learning_rate": 0.014710418949371057, + "loss": 0.0096, + "num_input_tokens_seen": 31042912, + "step": 34315 + }, + { + "epoch": 9.057806519730764, + "grad_norm": 0.0014149488415569067, + "learning_rate": 0.014684989483913495, + "loss": 0.0254, + "num_input_tokens_seen": 31047680, + "step": 34320 + }, + { + "epoch": 9.059126303286261, + "grad_norm": 0.000165181074407883, + "learning_rate": 0.014659580885731077, + "loss": 0.0209, + "num_input_tokens_seen": 31052384, + "step": 34325 + }, + { + "epoch": 9.060446086841758, + "grad_norm": 0.0009667830890975893, + "learning_rate": 0.014634193158742047, + "loss": 0.0115, + "num_input_tokens_seen": 31056704, + "step": 34330 + }, + { + "epoch": 9.061765870397254, + "grad_norm": 0.002021957188844681, + "learning_rate": 0.014608826306861576, + "loss": 0.0204, + "num_input_tokens_seen": 31061248, + "step": 34335 + }, + { + "epoch": 9.063085653952752, + "grad_norm": 0.002167664235457778, + "learning_rate": 0.014583480334001486, + "loss": 0.0173, + "num_input_tokens_seen": 31065792, + "step": 34340 + }, + { + "epoch": 9.064405437508249, + "grad_norm": 0.00035598609247244895, + "learning_rate": 0.014558155244070496, + "loss": 0.02, + "num_input_tokens_seen": 31070592, + "step": 34345 + }, + { + "epoch": 9.065725221063746, + "grad_norm": 0.00036851176992058754, + "learning_rate": 0.014532851040974036, + "loss": 0.0215, + "num_input_tokens_seen": 31074880, + "step": 34350 + }, + { + "epoch": 9.067045004619242, + "grad_norm": 0.00017212494276463985, + "learning_rate": 0.014507567728614335, + "loss": 0.0085, + "num_input_tokens_seen": 31079264, + "step": 34355 + }, + { + "epoch": 9.068364788174739, + "grad_norm": 0.00042419484816491604, + "learning_rate": 0.01448230531089037, + "loss": 0.0139, + "num_input_tokens_seen": 31083680, + "step": 34360 + }, + { + "epoch": 9.069684571730237, + "grad_norm": 8.355370664503425e-05, + "learning_rate": 0.014457063791697993, + "loss": 0.0058, + "num_input_tokens_seen": 31088672, + "step": 34365 + }, + { + "epoch": 9.071004355285734, + "grad_norm": 0.001109978067688644, + "learning_rate": 0.01443184317492971, + "loss": 0.0234, + "num_input_tokens_seen": 31093280, + "step": 34370 + }, + { + "epoch": 9.07232413884123, + "grad_norm": 0.0015847510658204556, + "learning_rate": 0.014406643464474822, + "loss": 0.0049, + "num_input_tokens_seen": 31097888, + "step": 34375 + }, + { + "epoch": 9.073643922396727, + "grad_norm": 0.001958612585440278, + "learning_rate": 0.014381464664219539, + "loss": 0.0202, + "num_input_tokens_seen": 31102240, + "step": 34380 + }, + { + "epoch": 9.074963705952223, + "grad_norm": 0.0037514783907681704, + "learning_rate": 0.014356306778046656, + "loss": 0.0288, + "num_input_tokens_seen": 31107040, + "step": 34385 + }, + { + "epoch": 9.076283489507722, + "grad_norm": 0.0004941488732583821, + "learning_rate": 0.014331169809835885, + "loss": 0.0226, + "num_input_tokens_seen": 31111584, + "step": 34390 + }, + { + "epoch": 9.077603273063218, + "grad_norm": 0.0021983436308801174, + "learning_rate": 0.014306053763463644, + "loss": 0.0231, + "num_input_tokens_seen": 31116000, + "step": 34395 + }, + { + "epoch": 9.078923056618715, + "grad_norm": 0.0006378380348905921, + "learning_rate": 0.014280958642803147, + "loss": 0.013, + "num_input_tokens_seen": 31120480, + "step": 34400 + }, + { + "epoch": 9.078923056618715, + "eval_loss": 0.10439366102218628, + "eval_runtime": 75.9883, + "eval_samples_per_second": 88.632, + "eval_steps_per_second": 22.161, + "num_input_tokens_seen": 31120480, + "step": 34400 + }, + { + "epoch": 9.080242840174211, + "grad_norm": 0.0006438057753257453, + "learning_rate": 0.014255884451724404, + "loss": 0.0201, + "num_input_tokens_seen": 31125056, + "step": 34405 + }, + { + "epoch": 9.081562623729708, + "grad_norm": 0.0002489647886250168, + "learning_rate": 0.014230831194094101, + "loss": 0.0209, + "num_input_tokens_seen": 31129568, + "step": 34410 + }, + { + "epoch": 9.082882407285204, + "grad_norm": 0.002493489533662796, + "learning_rate": 0.014205798873775865, + "loss": 0.0154, + "num_input_tokens_seen": 31134144, + "step": 34415 + }, + { + "epoch": 9.084202190840703, + "grad_norm": 8.787799743004143e-05, + "learning_rate": 0.014180787494629893, + "loss": 0.0303, + "num_input_tokens_seen": 31138688, + "step": 34420 + }, + { + "epoch": 9.0855219743962, + "grad_norm": 0.0056268926709890366, + "learning_rate": 0.014155797060513314, + "loss": 0.0233, + "num_input_tokens_seen": 31143072, + "step": 34425 + }, + { + "epoch": 9.086841757951696, + "grad_norm": 0.00023765563673805445, + "learning_rate": 0.014130827575279963, + "loss": 0.0313, + "num_input_tokens_seen": 31147648, + "step": 34430 + }, + { + "epoch": 9.088161541507192, + "grad_norm": 0.0009485400514677167, + "learning_rate": 0.014105879042780427, + "loss": 0.0163, + "num_input_tokens_seen": 31152544, + "step": 34435 + }, + { + "epoch": 9.089481325062689, + "grad_norm": 0.0014709309907630086, + "learning_rate": 0.014080951466862113, + "loss": 0.0058, + "num_input_tokens_seen": 31157184, + "step": 34440 + }, + { + "epoch": 9.090801108618187, + "grad_norm": 0.0030413733329623938, + "learning_rate": 0.014056044851369126, + "loss": 0.0162, + "num_input_tokens_seen": 31161888, + "step": 34445 + }, + { + "epoch": 9.092120892173684, + "grad_norm": 0.0004029013798572123, + "learning_rate": 0.014031159200142428, + "loss": 0.005, + "num_input_tokens_seen": 31166496, + "step": 34450 + }, + { + "epoch": 9.09344067572918, + "grad_norm": 0.0010436212178319693, + "learning_rate": 0.014006294517019667, + "loss": 0.0081, + "num_input_tokens_seen": 31170752, + "step": 34455 + }, + { + "epoch": 9.094760459284677, + "grad_norm": 0.0018430798081681132, + "learning_rate": 0.013981450805835276, + "loss": 0.0178, + "num_input_tokens_seen": 31175328, + "step": 34460 + }, + { + "epoch": 9.096080242840173, + "grad_norm": 0.0009987022494897246, + "learning_rate": 0.01395662807042049, + "loss": 0.0197, + "num_input_tokens_seen": 31179712, + "step": 34465 + }, + { + "epoch": 9.097400026395672, + "grad_norm": 0.0008583693415857852, + "learning_rate": 0.013931826314603296, + "loss": 0.0078, + "num_input_tokens_seen": 31184096, + "step": 34470 + }, + { + "epoch": 9.098719809951168, + "grad_norm": 0.0032674716785550117, + "learning_rate": 0.013907045542208401, + "loss": 0.026, + "num_input_tokens_seen": 31188576, + "step": 34475 + }, + { + "epoch": 9.100039593506665, + "grad_norm": 0.0007755915867164731, + "learning_rate": 0.013882285757057333, + "loss": 0.0076, + "num_input_tokens_seen": 31193184, + "step": 34480 + }, + { + "epoch": 9.101359377062161, + "grad_norm": 2.5731224013725296e-05, + "learning_rate": 0.013857546962968403, + "loss": 0.0162, + "num_input_tokens_seen": 31197536, + "step": 34485 + }, + { + "epoch": 9.102679160617658, + "grad_norm": 0.0015280982479453087, + "learning_rate": 0.013832829163756577, + "loss": 0.0074, + "num_input_tokens_seen": 31201888, + "step": 34490 + }, + { + "epoch": 9.103998944173156, + "grad_norm": 0.00020323747594375163, + "learning_rate": 0.013808132363233689, + "loss": 0.018, + "num_input_tokens_seen": 31206656, + "step": 34495 + }, + { + "epoch": 9.105318727728653, + "grad_norm": 0.0012669494608417153, + "learning_rate": 0.013783456565208256, + "loss": 0.0136, + "num_input_tokens_seen": 31210944, + "step": 34500 + }, + { + "epoch": 9.10663851128415, + "grad_norm": 0.0006236141198314726, + "learning_rate": 0.01375880177348564, + "loss": 0.0074, + "num_input_tokens_seen": 31215488, + "step": 34505 + }, + { + "epoch": 9.107958294839646, + "grad_norm": 0.00101556652225554, + "learning_rate": 0.013734167991867928, + "loss": 0.0086, + "num_input_tokens_seen": 31220192, + "step": 34510 + }, + { + "epoch": 9.109278078395143, + "grad_norm": 0.001281090546399355, + "learning_rate": 0.013709555224153935, + "loss": 0.0286, + "num_input_tokens_seen": 31224768, + "step": 34515 + }, + { + "epoch": 9.110597861950641, + "grad_norm": 0.0005429329467006028, + "learning_rate": 0.013684963474139222, + "loss": 0.0311, + "num_input_tokens_seen": 31229184, + "step": 34520 + }, + { + "epoch": 9.111917645506137, + "grad_norm": 0.0018207263201475143, + "learning_rate": 0.013660392745616224, + "loss": 0.0113, + "num_input_tokens_seen": 31233792, + "step": 34525 + }, + { + "epoch": 9.113237429061634, + "grad_norm": 0.0016170659800991416, + "learning_rate": 0.013635843042373974, + "loss": 0.0204, + "num_input_tokens_seen": 31238400, + "step": 34530 + }, + { + "epoch": 9.11455721261713, + "grad_norm": 0.00028899009339511395, + "learning_rate": 0.01361131436819843, + "loss": 0.0059, + "num_input_tokens_seen": 31242496, + "step": 34535 + }, + { + "epoch": 9.115876996172627, + "grad_norm": 0.0002519276749808341, + "learning_rate": 0.013586806726872147, + "loss": 0.0084, + "num_input_tokens_seen": 31247136, + "step": 34540 + }, + { + "epoch": 9.117196779728125, + "grad_norm": 0.0002537187247071415, + "learning_rate": 0.013562320122174537, + "loss": 0.0032, + "num_input_tokens_seen": 31251840, + "step": 34545 + }, + { + "epoch": 9.118516563283622, + "grad_norm": 0.0009793516946956515, + "learning_rate": 0.013537854557881762, + "loss": 0.0049, + "num_input_tokens_seen": 31256288, + "step": 34550 + }, + { + "epoch": 9.119836346839119, + "grad_norm": 0.0009561321930959821, + "learning_rate": 0.013513410037766687, + "loss": 0.0163, + "num_input_tokens_seen": 31260544, + "step": 34555 + }, + { + "epoch": 9.121156130394615, + "grad_norm": 0.0007836767472326756, + "learning_rate": 0.013488986565598998, + "loss": 0.0086, + "num_input_tokens_seen": 31264864, + "step": 34560 + }, + { + "epoch": 9.122475913950112, + "grad_norm": 0.00039524963358417153, + "learning_rate": 0.013464584145145097, + "loss": 0.0154, + "num_input_tokens_seen": 31269408, + "step": 34565 + }, + { + "epoch": 9.123795697505608, + "grad_norm": 0.00131582235917449, + "learning_rate": 0.013440202780168109, + "loss": 0.0067, + "num_input_tokens_seen": 31274272, + "step": 34570 + }, + { + "epoch": 9.125115481061107, + "grad_norm": 0.002621653024107218, + "learning_rate": 0.01341584247442799, + "loss": 0.021, + "num_input_tokens_seen": 31278880, + "step": 34575 + }, + { + "epoch": 9.126435264616603, + "grad_norm": 0.0022976738400757313, + "learning_rate": 0.013391503231681355, + "loss": 0.0276, + "num_input_tokens_seen": 31283360, + "step": 34580 + }, + { + "epoch": 9.1277550481721, + "grad_norm": 0.00038101477548480034, + "learning_rate": 0.013367185055681685, + "loss": 0.0116, + "num_input_tokens_seen": 31287744, + "step": 34585 + }, + { + "epoch": 9.129074831727596, + "grad_norm": 0.003299339674413204, + "learning_rate": 0.013342887950179095, + "loss": 0.0125, + "num_input_tokens_seen": 31292064, + "step": 34590 + }, + { + "epoch": 9.130394615283093, + "grad_norm": 0.004125641658902168, + "learning_rate": 0.013318611918920554, + "loss": 0.0776, + "num_input_tokens_seen": 31296544, + "step": 34595 + }, + { + "epoch": 9.131714398838591, + "grad_norm": 0.00031927580130286515, + "learning_rate": 0.01329435696564965, + "loss": 0.0235, + "num_input_tokens_seen": 31301056, + "step": 34600 + }, + { + "epoch": 9.131714398838591, + "eval_loss": 0.1055363342165947, + "eval_runtime": 75.8961, + "eval_samples_per_second": 88.74, + "eval_steps_per_second": 22.188, + "num_input_tokens_seen": 31301056, + "step": 34600 + }, + { + "epoch": 9.133034182394088, + "grad_norm": 0.002571467775851488, + "learning_rate": 0.013270123094106894, + "loss": 0.0087, + "num_input_tokens_seen": 31305440, + "step": 34605 + }, + { + "epoch": 9.134353965949584, + "grad_norm": 0.0011270138202235103, + "learning_rate": 0.013245910308029395, + "loss": 0.0143, + "num_input_tokens_seen": 31309824, + "step": 34610 + }, + { + "epoch": 9.13567374950508, + "grad_norm": 0.002234142739325762, + "learning_rate": 0.0132217186111511, + "loss": 0.0128, + "num_input_tokens_seen": 31313920, + "step": 34615 + }, + { + "epoch": 9.136993533060577, + "grad_norm": 0.00022554605675395578, + "learning_rate": 0.013197548007202626, + "loss": 0.0106, + "num_input_tokens_seen": 31318208, + "step": 34620 + }, + { + "epoch": 9.138313316616076, + "grad_norm": 0.0003564437211025506, + "learning_rate": 0.01317339849991142, + "loss": 0.0067, + "num_input_tokens_seen": 31322816, + "step": 34625 + }, + { + "epoch": 9.139633100171572, + "grad_norm": 0.0014194775139912963, + "learning_rate": 0.013149270093001675, + "loss": 0.0112, + "num_input_tokens_seen": 31327360, + "step": 34630 + }, + { + "epoch": 9.140952883727069, + "grad_norm": 0.003950096201151609, + "learning_rate": 0.013125162790194227, + "loss": 0.0195, + "num_input_tokens_seen": 31332064, + "step": 34635 + }, + { + "epoch": 9.142272667282565, + "grad_norm": 0.00036806147545576096, + "learning_rate": 0.01310107659520674, + "loss": 0.0329, + "num_input_tokens_seen": 31336288, + "step": 34640 + }, + { + "epoch": 9.143592450838062, + "grad_norm": 0.0026069371961057186, + "learning_rate": 0.013077011511753655, + "loss": 0.0204, + "num_input_tokens_seen": 31340544, + "step": 34645 + }, + { + "epoch": 9.14491223439356, + "grad_norm": 0.0033171221148222685, + "learning_rate": 0.013052967543546056, + "loss": 0.0192, + "num_input_tokens_seen": 31344896, + "step": 34650 + }, + { + "epoch": 9.146232017949057, + "grad_norm": 0.0012202602811157703, + "learning_rate": 0.01302894469429186, + "loss": 0.032, + "num_input_tokens_seen": 31349440, + "step": 34655 + }, + { + "epoch": 9.147551801504553, + "grad_norm": 0.001119745895266533, + "learning_rate": 0.013004942967695653, + "loss": 0.0098, + "num_input_tokens_seen": 31354048, + "step": 34660 + }, + { + "epoch": 9.14887158506005, + "grad_norm": 0.0006190053536556661, + "learning_rate": 0.012980962367458859, + "loss": 0.0191, + "num_input_tokens_seen": 31358720, + "step": 34665 + }, + { + "epoch": 9.150191368615546, + "grad_norm": 0.00014006023411639035, + "learning_rate": 0.012957002897279567, + "loss": 0.0124, + "num_input_tokens_seen": 31363520, + "step": 34670 + }, + { + "epoch": 9.151511152171043, + "grad_norm": 0.00033958128187805414, + "learning_rate": 0.012933064560852576, + "loss": 0.0097, + "num_input_tokens_seen": 31368128, + "step": 34675 + }, + { + "epoch": 9.152830935726541, + "grad_norm": 0.0009583854698576033, + "learning_rate": 0.012909147361869527, + "loss": 0.0158, + "num_input_tokens_seen": 31372864, + "step": 34680 + }, + { + "epoch": 9.154150719282038, + "grad_norm": 0.0008158156415447593, + "learning_rate": 0.012885251304018774, + "loss": 0.0177, + "num_input_tokens_seen": 31377280, + "step": 34685 + }, + { + "epoch": 9.155470502837534, + "grad_norm": 0.0009936976712197065, + "learning_rate": 0.012861376390985335, + "loss": 0.0153, + "num_input_tokens_seen": 31381952, + "step": 34690 + }, + { + "epoch": 9.156790286393031, + "grad_norm": 0.00045013081398792565, + "learning_rate": 0.012837522626451063, + "loss": 0.0164, + "num_input_tokens_seen": 31386784, + "step": 34695 + }, + { + "epoch": 9.158110069948528, + "grad_norm": 0.003215563716366887, + "learning_rate": 0.01281369001409447, + "loss": 0.0102, + "num_input_tokens_seen": 31391648, + "step": 34700 + }, + { + "epoch": 9.159429853504026, + "grad_norm": 0.0004889045376330614, + "learning_rate": 0.012789878557590877, + "loss": 0.016, + "num_input_tokens_seen": 31396256, + "step": 34705 + }, + { + "epoch": 9.160749637059522, + "grad_norm": 0.001186043256893754, + "learning_rate": 0.012766088260612334, + "loss": 0.0168, + "num_input_tokens_seen": 31400960, + "step": 34710 + }, + { + "epoch": 9.162069420615019, + "grad_norm": 0.0013924636878073215, + "learning_rate": 0.012742319126827523, + "loss": 0.0065, + "num_input_tokens_seen": 31405632, + "step": 34715 + }, + { + "epoch": 9.163389204170516, + "grad_norm": 0.002550938632339239, + "learning_rate": 0.012718571159902008, + "loss": 0.0259, + "num_input_tokens_seen": 31410080, + "step": 34720 + }, + { + "epoch": 9.164708987726012, + "grad_norm": 0.00111078517511487, + "learning_rate": 0.01269484436349803, + "loss": 0.0093, + "num_input_tokens_seen": 31414592, + "step": 34725 + }, + { + "epoch": 9.16602877128151, + "grad_norm": 0.00010832119005499408, + "learning_rate": 0.012671138741274528, + "loss": 0.0172, + "num_input_tokens_seen": 31419136, + "step": 34730 + }, + { + "epoch": 9.167348554837007, + "grad_norm": 0.001490532187744975, + "learning_rate": 0.012647454296887194, + "loss": 0.0153, + "num_input_tokens_seen": 31423552, + "step": 34735 + }, + { + "epoch": 9.168668338392504, + "grad_norm": 0.00141666061244905, + "learning_rate": 0.012623791033988507, + "loss": 0.019, + "num_input_tokens_seen": 31428064, + "step": 34740 + }, + { + "epoch": 9.169988121948, + "grad_norm": 0.001108787371776998, + "learning_rate": 0.012600148956227597, + "loss": 0.0149, + "num_input_tokens_seen": 31432576, + "step": 34745 + }, + { + "epoch": 9.171307905503497, + "grad_norm": 0.00010331127123208717, + "learning_rate": 0.012576528067250414, + "loss": 0.0138, + "num_input_tokens_seen": 31437152, + "step": 34750 + }, + { + "epoch": 9.172627689058995, + "grad_norm": 0.0011240183375775814, + "learning_rate": 0.012552928370699561, + "loss": 0.0191, + "num_input_tokens_seen": 31441760, + "step": 34755 + }, + { + "epoch": 9.173947472614492, + "grad_norm": 0.002649181755259633, + "learning_rate": 0.012529349870214411, + "loss": 0.022, + "num_input_tokens_seen": 31446208, + "step": 34760 + }, + { + "epoch": 9.175267256169988, + "grad_norm": 0.0031962846405804157, + "learning_rate": 0.012505792569431106, + "loss": 0.0144, + "num_input_tokens_seen": 31450432, + "step": 34765 + }, + { + "epoch": 9.176587039725485, + "grad_norm": 0.002800669288262725, + "learning_rate": 0.012482256471982422, + "loss": 0.0133, + "num_input_tokens_seen": 31455104, + "step": 34770 + }, + { + "epoch": 9.177906823280981, + "grad_norm": 0.000785275362432003, + "learning_rate": 0.012458741581497956, + "loss": 0.0157, + "num_input_tokens_seen": 31459392, + "step": 34775 + }, + { + "epoch": 9.17922660683648, + "grad_norm": 0.004710082430392504, + "learning_rate": 0.012435247901603974, + "loss": 0.0169, + "num_input_tokens_seen": 31463968, + "step": 34780 + }, + { + "epoch": 9.180546390391976, + "grad_norm": 0.000682963349390775, + "learning_rate": 0.012411775435923528, + "loss": 0.0288, + "num_input_tokens_seen": 31468256, + "step": 34785 + }, + { + "epoch": 9.181866173947473, + "grad_norm": 0.00023454143956769258, + "learning_rate": 0.012388324188076354, + "loss": 0.0189, + "num_input_tokens_seen": 31472736, + "step": 34790 + }, + { + "epoch": 9.18318595750297, + "grad_norm": 0.0006559199537150562, + "learning_rate": 0.012364894161678913, + "loss": 0.0139, + "num_input_tokens_seen": 31477376, + "step": 34795 + }, + { + "epoch": 9.184505741058466, + "grad_norm": 0.0019634165801107883, + "learning_rate": 0.012341485360344445, + "loss": 0.0111, + "num_input_tokens_seen": 31481824, + "step": 34800 + }, + { + "epoch": 9.184505741058466, + "eval_loss": 0.10844151675701141, + "eval_runtime": 75.8346, + "eval_samples_per_second": 88.812, + "eval_steps_per_second": 22.206, + "num_input_tokens_seen": 31481824, + "step": 34800 + }, + { + "epoch": 9.185825524613964, + "grad_norm": 0.003281951881945133, + "learning_rate": 0.01231809778768283, + "loss": 0.014, + "num_input_tokens_seen": 31486240, + "step": 34805 + }, + { + "epoch": 9.18714530816946, + "grad_norm": 0.00022837637516204268, + "learning_rate": 0.012294731447300799, + "loss": 0.0268, + "num_input_tokens_seen": 31490592, + "step": 34810 + }, + { + "epoch": 9.188465091724957, + "grad_norm": 0.0025212173350155354, + "learning_rate": 0.012271386342801671, + "loss": 0.013, + "num_input_tokens_seen": 31495296, + "step": 34815 + }, + { + "epoch": 9.189784875280454, + "grad_norm": 0.0001881404168670997, + "learning_rate": 0.012248062477785565, + "loss": 0.0171, + "num_input_tokens_seen": 31499872, + "step": 34820 + }, + { + "epoch": 9.19110465883595, + "grad_norm": 0.005447556264698505, + "learning_rate": 0.012224759855849305, + "loss": 0.0207, + "num_input_tokens_seen": 31504512, + "step": 34825 + }, + { + "epoch": 9.192424442391447, + "grad_norm": 0.0019671625923365355, + "learning_rate": 0.012201478480586513, + "loss": 0.0076, + "num_input_tokens_seen": 31508992, + "step": 34830 + }, + { + "epoch": 9.193744225946945, + "grad_norm": 0.00017003127140924335, + "learning_rate": 0.012178218355587389, + "loss": 0.0044, + "num_input_tokens_seen": 31513568, + "step": 34835 + }, + { + "epoch": 9.195064009502442, + "grad_norm": 0.0013961568474769592, + "learning_rate": 0.01215497948443896, + "loss": 0.013, + "num_input_tokens_seen": 31517920, + "step": 34840 + }, + { + "epoch": 9.196383793057938, + "grad_norm": 0.00014347424439620227, + "learning_rate": 0.012131761870724993, + "loss": 0.0218, + "num_input_tokens_seen": 31522400, + "step": 34845 + }, + { + "epoch": 9.197703576613435, + "grad_norm": 0.0012397957034409046, + "learning_rate": 0.012108565518025893, + "loss": 0.0054, + "num_input_tokens_seen": 31527072, + "step": 34850 + }, + { + "epoch": 9.199023360168932, + "grad_norm": 0.001780324848368764, + "learning_rate": 0.012085390429918862, + "loss": 0.0143, + "num_input_tokens_seen": 31531712, + "step": 34855 + }, + { + "epoch": 9.20034314372443, + "grad_norm": 0.001040993956848979, + "learning_rate": 0.012062236609977744, + "loss": 0.0158, + "num_input_tokens_seen": 31536032, + "step": 34860 + }, + { + "epoch": 9.201662927279926, + "grad_norm": 0.0024396555963903666, + "learning_rate": 0.01203910406177318, + "loss": 0.0392, + "num_input_tokens_seen": 31540544, + "step": 34865 + }, + { + "epoch": 9.202982710835423, + "grad_norm": 0.0030110045336186886, + "learning_rate": 0.01201599278887252, + "loss": 0.0078, + "num_input_tokens_seen": 31545216, + "step": 34870 + }, + { + "epoch": 9.20430249439092, + "grad_norm": 0.00041135784704238176, + "learning_rate": 0.011992902794839744, + "loss": 0.0081, + "num_input_tokens_seen": 31549312, + "step": 34875 + }, + { + "epoch": 9.205622277946416, + "grad_norm": 0.0004637839738279581, + "learning_rate": 0.011969834083235703, + "loss": 0.0085, + "num_input_tokens_seen": 31553888, + "step": 34880 + }, + { + "epoch": 9.206942061501914, + "grad_norm": 0.0009674050961621106, + "learning_rate": 0.011946786657617836, + "loss": 0.0157, + "num_input_tokens_seen": 31558592, + "step": 34885 + }, + { + "epoch": 9.208261845057411, + "grad_norm": 0.0010048977565020323, + "learning_rate": 0.011923760521540332, + "loss": 0.013, + "num_input_tokens_seen": 31562944, + "step": 34890 + }, + { + "epoch": 9.209581628612908, + "grad_norm": 0.0011987618636339903, + "learning_rate": 0.011900755678554153, + "loss": 0.0272, + "num_input_tokens_seen": 31567168, + "step": 34895 + }, + { + "epoch": 9.210901412168404, + "grad_norm": 0.0009863735176622868, + "learning_rate": 0.011877772132206893, + "loss": 0.0072, + "num_input_tokens_seen": 31571520, + "step": 34900 + }, + { + "epoch": 9.2122211957239, + "grad_norm": 0.0008729357505217195, + "learning_rate": 0.011854809886042915, + "loss": 0.0224, + "num_input_tokens_seen": 31576416, + "step": 34905 + }, + { + "epoch": 9.213540979279399, + "grad_norm": 0.00032410683343186975, + "learning_rate": 0.011831868943603325, + "loss": 0.0212, + "num_input_tokens_seen": 31580736, + "step": 34910 + }, + { + "epoch": 9.214860762834896, + "grad_norm": 0.000515385705512017, + "learning_rate": 0.011808949308425836, + "loss": 0.0147, + "num_input_tokens_seen": 31585248, + "step": 34915 + }, + { + "epoch": 9.216180546390392, + "grad_norm": 0.0005089101614430547, + "learning_rate": 0.01178605098404501, + "loss": 0.0135, + "num_input_tokens_seen": 31590048, + "step": 34920 + }, + { + "epoch": 9.217500329945889, + "grad_norm": 7.663746509933844e-05, + "learning_rate": 0.011763173973992002, + "loss": 0.0127, + "num_input_tokens_seen": 31594432, + "step": 34925 + }, + { + "epoch": 9.218820113501385, + "grad_norm": 0.0007018179749138653, + "learning_rate": 0.011740318281794776, + "loss": 0.0104, + "num_input_tokens_seen": 31598944, + "step": 34930 + }, + { + "epoch": 9.220139897056884, + "grad_norm": 0.0003955959400627762, + "learning_rate": 0.01171748391097796, + "loss": 0.0197, + "num_input_tokens_seen": 31603456, + "step": 34935 + }, + { + "epoch": 9.22145968061238, + "grad_norm": 0.0051323045045137405, + "learning_rate": 0.011694670865062873, + "loss": 0.0416, + "num_input_tokens_seen": 31607872, + "step": 34940 + }, + { + "epoch": 9.222779464167877, + "grad_norm": 0.0003030718071386218, + "learning_rate": 0.011671879147567616, + "loss": 0.0131, + "num_input_tokens_seen": 31612320, + "step": 34945 + }, + { + "epoch": 9.224099247723373, + "grad_norm": 0.004882145673036575, + "learning_rate": 0.011649108762006893, + "loss": 0.0149, + "num_input_tokens_seen": 31616832, + "step": 34950 + }, + { + "epoch": 9.22541903127887, + "grad_norm": 0.0009381487034261227, + "learning_rate": 0.011626359711892265, + "loss": 0.0072, + "num_input_tokens_seen": 31621344, + "step": 34955 + }, + { + "epoch": 9.226738814834366, + "grad_norm": 0.0007388201192952693, + "learning_rate": 0.01160363200073189, + "loss": 0.0057, + "num_input_tokens_seen": 31625920, + "step": 34960 + }, + { + "epoch": 9.228058598389865, + "grad_norm": 0.003919417504221201, + "learning_rate": 0.011580925632030614, + "loss": 0.0164, + "num_input_tokens_seen": 31630496, + "step": 34965 + }, + { + "epoch": 9.229378381945361, + "grad_norm": 0.0009084465564228594, + "learning_rate": 0.011558240609290104, + "loss": 0.0061, + "num_input_tokens_seen": 31635104, + "step": 34970 + }, + { + "epoch": 9.230698165500858, + "grad_norm": 0.003047112375497818, + "learning_rate": 0.011535576936008679, + "loss": 0.0104, + "num_input_tokens_seen": 31639616, + "step": 34975 + }, + { + "epoch": 9.232017949056354, + "grad_norm": 0.0021483649034053087, + "learning_rate": 0.011512934615681309, + "loss": 0.0128, + "num_input_tokens_seen": 31644096, + "step": 34980 + }, + { + "epoch": 9.23333773261185, + "grad_norm": 0.00017772539285942912, + "learning_rate": 0.011490313651799765, + "loss": 0.0411, + "num_input_tokens_seen": 31648608, + "step": 34985 + }, + { + "epoch": 9.23465751616735, + "grad_norm": 0.004606195259839296, + "learning_rate": 0.011467714047852512, + "loss": 0.0145, + "num_input_tokens_seen": 31652704, + "step": 34990 + }, + { + "epoch": 9.235977299722846, + "grad_norm": 0.0012191651621833444, + "learning_rate": 0.011445135807324624, + "loss": 0.0078, + "num_input_tokens_seen": 31656896, + "step": 34995 + }, + { + "epoch": 9.237297083278342, + "grad_norm": 0.0032939151860773563, + "learning_rate": 0.011422578933698002, + "loss": 0.0262, + "num_input_tokens_seen": 31661536, + "step": 35000 + }, + { + "epoch": 9.237297083278342, + "eval_loss": 0.109819695353508, + "eval_runtime": 75.8854, + "eval_samples_per_second": 88.752, + "eval_steps_per_second": 22.191, + "num_input_tokens_seen": 31661536, + "step": 35000 + }, + { + "epoch": 9.238616866833839, + "grad_norm": 8.283637725980952e-05, + "learning_rate": 0.011400043430451161, + "loss": 0.0113, + "num_input_tokens_seen": 31666048, + "step": 35005 + }, + { + "epoch": 9.239936650389335, + "grad_norm": 0.0008041912224143744, + "learning_rate": 0.011377529301059392, + "loss": 0.0226, + "num_input_tokens_seen": 31670432, + "step": 35010 + }, + { + "epoch": 9.241256433944834, + "grad_norm": 0.001944412593729794, + "learning_rate": 0.011355036548994646, + "loss": 0.0281, + "num_input_tokens_seen": 31674944, + "step": 35015 + }, + { + "epoch": 9.24257621750033, + "grad_norm": 0.004583792295306921, + "learning_rate": 0.011332565177725584, + "loss": 0.0203, + "num_input_tokens_seen": 31679744, + "step": 35020 + }, + { + "epoch": 9.243896001055827, + "grad_norm": 0.0009482185705564916, + "learning_rate": 0.011310115190717585, + "loss": 0.0091, + "num_input_tokens_seen": 31683968, + "step": 35025 + }, + { + "epoch": 9.245215784611323, + "grad_norm": 0.0013909486588090658, + "learning_rate": 0.01128768659143271, + "loss": 0.0111, + "num_input_tokens_seen": 31688416, + "step": 35030 + }, + { + "epoch": 9.24653556816682, + "grad_norm": 0.0004441465134732425, + "learning_rate": 0.011265279383329713, + "loss": 0.0091, + "num_input_tokens_seen": 31693280, + "step": 35035 + }, + { + "epoch": 9.247855351722318, + "grad_norm": 0.0001445785310352221, + "learning_rate": 0.01124289356986411, + "loss": 0.0226, + "num_input_tokens_seen": 31697856, + "step": 35040 + }, + { + "epoch": 9.249175135277815, + "grad_norm": 0.0007907090475782752, + "learning_rate": 0.011220529154488023, + "loss": 0.0123, + "num_input_tokens_seen": 31702688, + "step": 35045 + }, + { + "epoch": 9.250494918833311, + "grad_norm": 0.0010366464266553521, + "learning_rate": 0.011198186140650346, + "loss": 0.0069, + "num_input_tokens_seen": 31706976, + "step": 35050 + }, + { + "epoch": 9.251814702388808, + "grad_norm": 0.0009249930735677481, + "learning_rate": 0.011175864531796685, + "loss": 0.0248, + "num_input_tokens_seen": 31711808, + "step": 35055 + }, + { + "epoch": 9.253134485944305, + "grad_norm": 0.003948194440454245, + "learning_rate": 0.011153564331369258, + "loss": 0.0182, + "num_input_tokens_seen": 31716448, + "step": 35060 + }, + { + "epoch": 9.254454269499803, + "grad_norm": 0.0043702758848667145, + "learning_rate": 0.011131285542807078, + "loss": 0.0333, + "num_input_tokens_seen": 31720960, + "step": 35065 + }, + { + "epoch": 9.2557740530553, + "grad_norm": 0.006371313706040382, + "learning_rate": 0.011109028169545815, + "loss": 0.0257, + "num_input_tokens_seen": 31725696, + "step": 35070 + }, + { + "epoch": 9.257093836610796, + "grad_norm": 0.0013466362142935395, + "learning_rate": 0.011086792215017804, + "loss": 0.0255, + "num_input_tokens_seen": 31730240, + "step": 35075 + }, + { + "epoch": 9.258413620166293, + "grad_norm": 0.0017263927729800344, + "learning_rate": 0.011064577682652137, + "loss": 0.0422, + "num_input_tokens_seen": 31734880, + "step": 35080 + }, + { + "epoch": 9.259733403721789, + "grad_norm": 0.0009751687757670879, + "learning_rate": 0.011042384575874559, + "loss": 0.0149, + "num_input_tokens_seen": 31739392, + "step": 35085 + }, + { + "epoch": 9.261053187277287, + "grad_norm": 0.0006851436919532716, + "learning_rate": 0.011020212898107512, + "loss": 0.0204, + "num_input_tokens_seen": 31743648, + "step": 35090 + }, + { + "epoch": 9.262372970832784, + "grad_norm": 0.0006352120544761419, + "learning_rate": 0.010998062652770197, + "loss": 0.008, + "num_input_tokens_seen": 31748320, + "step": 35095 + }, + { + "epoch": 9.26369275438828, + "grad_norm": 0.0042565856128931046, + "learning_rate": 0.010975933843278428, + "loss": 0.0323, + "num_input_tokens_seen": 31752704, + "step": 35100 + }, + { + "epoch": 9.265012537943777, + "grad_norm": 0.00043468945659697056, + "learning_rate": 0.010953826473044714, + "loss": 0.0118, + "num_input_tokens_seen": 31756864, + "step": 35105 + }, + { + "epoch": 9.266332321499274, + "grad_norm": 0.004622374661266804, + "learning_rate": 0.010931740545478357, + "loss": 0.0443, + "num_input_tokens_seen": 31761152, + "step": 35110 + }, + { + "epoch": 9.26765210505477, + "grad_norm": 0.004260319285094738, + "learning_rate": 0.010909676063985218, + "loss": 0.0244, + "num_input_tokens_seen": 31765536, + "step": 35115 + }, + { + "epoch": 9.268971888610269, + "grad_norm": 0.0003330945037305355, + "learning_rate": 0.010887633031967974, + "loss": 0.0366, + "num_input_tokens_seen": 31770080, + "step": 35120 + }, + { + "epoch": 9.270291672165765, + "grad_norm": 0.003868361935019493, + "learning_rate": 0.01086561145282589, + "loss": 0.0263, + "num_input_tokens_seen": 31774240, + "step": 35125 + }, + { + "epoch": 9.271611455721262, + "grad_norm": 0.004864483140408993, + "learning_rate": 0.010843611329954983, + "loss": 0.0237, + "num_input_tokens_seen": 31778624, + "step": 35130 + }, + { + "epoch": 9.272931239276758, + "grad_norm": 0.0008055620128288865, + "learning_rate": 0.010821632666747988, + "loss": 0.0164, + "num_input_tokens_seen": 31783040, + "step": 35135 + }, + { + "epoch": 9.274251022832255, + "grad_norm": 0.001363382674753666, + "learning_rate": 0.010799675466594244, + "loss": 0.0204, + "num_input_tokens_seen": 31787840, + "step": 35140 + }, + { + "epoch": 9.275570806387753, + "grad_norm": 0.0007898684125393629, + "learning_rate": 0.010777739732879826, + "loss": 0.0097, + "num_input_tokens_seen": 31792160, + "step": 35145 + }, + { + "epoch": 9.27689058994325, + "grad_norm": 0.001695849816314876, + "learning_rate": 0.010755825468987562, + "loss": 0.0241, + "num_input_tokens_seen": 31797056, + "step": 35150 + }, + { + "epoch": 9.278210373498746, + "grad_norm": 0.0012145418440923095, + "learning_rate": 0.010733932678296814, + "loss": 0.0169, + "num_input_tokens_seen": 31801568, + "step": 35155 + }, + { + "epoch": 9.279530157054243, + "grad_norm": 0.0001504808897152543, + "learning_rate": 0.010712061364183817, + "loss": 0.0169, + "num_input_tokens_seen": 31805952, + "step": 35160 + }, + { + "epoch": 9.28084994060974, + "grad_norm": 0.00040370519855059683, + "learning_rate": 0.010690211530021337, + "loss": 0.0086, + "num_input_tokens_seen": 31810528, + "step": 35165 + }, + { + "epoch": 9.282169724165238, + "grad_norm": 0.0002154841204173863, + "learning_rate": 0.01066838317917893, + "loss": 0.0169, + "num_input_tokens_seen": 31814624, + "step": 35170 + }, + { + "epoch": 9.283489507720734, + "grad_norm": 0.0030958130955696106, + "learning_rate": 0.010646576315022787, + "loss": 0.0129, + "num_input_tokens_seen": 31818944, + "step": 35175 + }, + { + "epoch": 9.28480929127623, + "grad_norm": 0.001028845552355051, + "learning_rate": 0.010624790940915785, + "loss": 0.014, + "num_input_tokens_seen": 31823680, + "step": 35180 + }, + { + "epoch": 9.286129074831727, + "grad_norm": 0.002359787467867136, + "learning_rate": 0.0106030270602175, + "loss": 0.0204, + "num_input_tokens_seen": 31828000, + "step": 35185 + }, + { + "epoch": 9.287448858387224, + "grad_norm": 0.001191906281746924, + "learning_rate": 0.010581284676284252, + "loss": 0.0216, + "num_input_tokens_seen": 31832640, + "step": 35190 + }, + { + "epoch": 9.288768641942722, + "grad_norm": 0.004030841402709484, + "learning_rate": 0.010559563792468923, + "loss": 0.0106, + "num_input_tokens_seen": 31837280, + "step": 35195 + }, + { + "epoch": 9.290088425498219, + "grad_norm": 0.0009127891971729696, + "learning_rate": 0.010537864412121217, + "loss": 0.0061, + "num_input_tokens_seen": 31842016, + "step": 35200 + }, + { + "epoch": 9.290088425498219, + "eval_loss": 0.11015855520963669, + "eval_runtime": 75.8283, + "eval_samples_per_second": 88.819, + "eval_steps_per_second": 22.208, + "num_input_tokens_seen": 31842016, + "step": 35200 + }, + { + "epoch": 9.291408209053715, + "grad_norm": 0.0010068680858239532, + "learning_rate": 0.010516186538587357, + "loss": 0.0448, + "num_input_tokens_seen": 31846400, + "step": 35205 + }, + { + "epoch": 9.292727992609212, + "grad_norm": 8.826998237054795e-05, + "learning_rate": 0.01049453017521042, + "loss": 0.0117, + "num_input_tokens_seen": 31850912, + "step": 35210 + }, + { + "epoch": 9.294047776164708, + "grad_norm": 0.0009841728024184704, + "learning_rate": 0.010472895325330083, + "loss": 0.01, + "num_input_tokens_seen": 31855296, + "step": 35215 + }, + { + "epoch": 9.295367559720205, + "grad_norm": 0.001771582872606814, + "learning_rate": 0.010451281992282662, + "loss": 0.0326, + "num_input_tokens_seen": 31859648, + "step": 35220 + }, + { + "epoch": 9.296687343275703, + "grad_norm": 0.0016878460301086307, + "learning_rate": 0.01042969017940124, + "loss": 0.0108, + "num_input_tokens_seen": 31864576, + "step": 35225 + }, + { + "epoch": 9.2980071268312, + "grad_norm": 0.0006578727043233812, + "learning_rate": 0.01040811989001557, + "loss": 0.0213, + "num_input_tokens_seen": 31868928, + "step": 35230 + }, + { + "epoch": 9.299326910386696, + "grad_norm": 0.001746877795085311, + "learning_rate": 0.010386571127451992, + "loss": 0.0093, + "num_input_tokens_seen": 31873536, + "step": 35235 + }, + { + "epoch": 9.300646693942193, + "grad_norm": 0.0002772475709207356, + "learning_rate": 0.010365043895033682, + "loss": 0.0042, + "num_input_tokens_seen": 31877984, + "step": 35240 + }, + { + "epoch": 9.30196647749769, + "grad_norm": 0.00161816889885813, + "learning_rate": 0.010343538196080365, + "loss": 0.0042, + "num_input_tokens_seen": 31882560, + "step": 35245 + }, + { + "epoch": 9.303286261053188, + "grad_norm": 0.0019386375788599253, + "learning_rate": 0.010322054033908457, + "loss": 0.0289, + "num_input_tokens_seen": 31887200, + "step": 35250 + }, + { + "epoch": 9.304606044608684, + "grad_norm": 0.0015694652684032917, + "learning_rate": 0.010300591411831156, + "loss": 0.0174, + "num_input_tokens_seen": 31891712, + "step": 35255 + }, + { + "epoch": 9.305925828164181, + "grad_norm": 0.0005018912488594651, + "learning_rate": 0.010279150333158198, + "loss": 0.0129, + "num_input_tokens_seen": 31896192, + "step": 35260 + }, + { + "epoch": 9.307245611719678, + "grad_norm": 0.00266477488912642, + "learning_rate": 0.010257730801196107, + "loss": 0.0081, + "num_input_tokens_seen": 31900672, + "step": 35265 + }, + { + "epoch": 9.308565395275174, + "grad_norm": 0.0009296797798015177, + "learning_rate": 0.010236332819248056, + "loss": 0.0247, + "num_input_tokens_seen": 31905216, + "step": 35270 + }, + { + "epoch": 9.309885178830672, + "grad_norm": 0.0026643730234354734, + "learning_rate": 0.010214956390613854, + "loss": 0.031, + "num_input_tokens_seen": 31909760, + "step": 35275 + }, + { + "epoch": 9.311204962386169, + "grad_norm": 0.0016385894268751144, + "learning_rate": 0.010193601518590034, + "loss": 0.0251, + "num_input_tokens_seen": 31914144, + "step": 35280 + }, + { + "epoch": 9.312524745941666, + "grad_norm": 0.0029575019143521786, + "learning_rate": 0.010172268206469758, + "loss": 0.0521, + "num_input_tokens_seen": 31918976, + "step": 35285 + }, + { + "epoch": 9.313844529497162, + "grad_norm": 0.0008625427726656199, + "learning_rate": 0.010150956457542897, + "loss": 0.0106, + "num_input_tokens_seen": 31923712, + "step": 35290 + }, + { + "epoch": 9.315164313052659, + "grad_norm": 0.00028018580633215606, + "learning_rate": 0.010129666275096054, + "loss": 0.0037, + "num_input_tokens_seen": 31927808, + "step": 35295 + }, + { + "epoch": 9.316484096608157, + "grad_norm": 0.001236005569808185, + "learning_rate": 0.010108397662412338, + "loss": 0.0238, + "num_input_tokens_seen": 31932032, + "step": 35300 + }, + { + "epoch": 9.317803880163654, + "grad_norm": 0.0014541142154484987, + "learning_rate": 0.010087150622771707, + "loss": 0.0086, + "num_input_tokens_seen": 31936768, + "step": 35305 + }, + { + "epoch": 9.31912366371915, + "grad_norm": 0.00038146492443047464, + "learning_rate": 0.010065925159450739, + "loss": 0.0101, + "num_input_tokens_seen": 31941376, + "step": 35310 + }, + { + "epoch": 9.320443447274647, + "grad_norm": 0.0001636487722862512, + "learning_rate": 0.010044721275722618, + "loss": 0.0028, + "num_input_tokens_seen": 31945920, + "step": 35315 + }, + { + "epoch": 9.321763230830143, + "grad_norm": 0.000685185834299773, + "learning_rate": 0.01002353897485726, + "loss": 0.0096, + "num_input_tokens_seen": 31950464, + "step": 35320 + }, + { + "epoch": 9.323083014385642, + "grad_norm": 0.0008670947281643748, + "learning_rate": 0.010002378260121236, + "loss": 0.0161, + "num_input_tokens_seen": 31954880, + "step": 35325 + }, + { + "epoch": 9.324402797941138, + "grad_norm": 0.004276195541024208, + "learning_rate": 0.009981239134777786, + "loss": 0.026, + "num_input_tokens_seen": 31959328, + "step": 35330 + }, + { + "epoch": 9.325722581496635, + "grad_norm": 0.0006710073794238269, + "learning_rate": 0.009960121602086884, + "loss": 0.0079, + "num_input_tokens_seen": 31963712, + "step": 35335 + }, + { + "epoch": 9.327042365052131, + "grad_norm": 0.0004148497828282416, + "learning_rate": 0.009939025665305062, + "loss": 0.0218, + "num_input_tokens_seen": 31968000, + "step": 35340 + }, + { + "epoch": 9.328362148607628, + "grad_norm": 0.0007235205266624689, + "learning_rate": 0.009917951327685597, + "loss": 0.0134, + "num_input_tokens_seen": 31972320, + "step": 35345 + }, + { + "epoch": 9.329681932163126, + "grad_norm": 0.001863925950601697, + "learning_rate": 0.009896898592478425, + "loss": 0.0382, + "num_input_tokens_seen": 31976736, + "step": 35350 + }, + { + "epoch": 9.331001715718623, + "grad_norm": 0.0003995405277237296, + "learning_rate": 0.009875867462930132, + "loss": 0.0077, + "num_input_tokens_seen": 31981024, + "step": 35355 + }, + { + "epoch": 9.33232149927412, + "grad_norm": 0.002118954434990883, + "learning_rate": 0.009854857942284006, + "loss": 0.0094, + "num_input_tokens_seen": 31985408, + "step": 35360 + }, + { + "epoch": 9.333641282829616, + "grad_norm": 0.002231516409665346, + "learning_rate": 0.009833870033779923, + "loss": 0.0191, + "num_input_tokens_seen": 31989632, + "step": 35365 + }, + { + "epoch": 9.334961066385112, + "grad_norm": 0.0008695528376847506, + "learning_rate": 0.009812903740654527, + "loss": 0.0029, + "num_input_tokens_seen": 31994304, + "step": 35370 + }, + { + "epoch": 9.336280849940609, + "grad_norm": 0.00015062217426020652, + "learning_rate": 0.009791959066141097, + "loss": 0.0393, + "num_input_tokens_seen": 31999008, + "step": 35375 + }, + { + "epoch": 9.337600633496107, + "grad_norm": 0.0005757714388892055, + "learning_rate": 0.009771036013469537, + "loss": 0.0088, + "num_input_tokens_seen": 32003616, + "step": 35380 + }, + { + "epoch": 9.338920417051604, + "grad_norm": 0.0014029026497155428, + "learning_rate": 0.00975013458586646, + "loss": 0.0137, + "num_input_tokens_seen": 32008352, + "step": 35385 + }, + { + "epoch": 9.3402402006071, + "grad_norm": 0.003813413204625249, + "learning_rate": 0.009729254786555107, + "loss": 0.0112, + "num_input_tokens_seen": 32012800, + "step": 35390 + }, + { + "epoch": 9.341559984162597, + "grad_norm": 0.0008579494897276163, + "learning_rate": 0.009708396618755421, + "loss": 0.0192, + "num_input_tokens_seen": 32016992, + "step": 35395 + }, + { + "epoch": 9.342879767718093, + "grad_norm": 0.0011539157712832093, + "learning_rate": 0.009687560085683994, + "loss": 0.0098, + "num_input_tokens_seen": 32021408, + "step": 35400 + }, + { + "epoch": 9.342879767718093, + "eval_loss": 0.11059147864580154, + "eval_runtime": 75.8796, + "eval_samples_per_second": 88.759, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 32021408, + "step": 35400 + }, + { + "epoch": 9.344199551273592, + "grad_norm": 0.0003652485611382872, + "learning_rate": 0.009666745190554054, + "loss": 0.0204, + "num_input_tokens_seen": 32025984, + "step": 35405 + }, + { + "epoch": 9.345519334829088, + "grad_norm": 0.0017317214515060186, + "learning_rate": 0.009645951936575553, + "loss": 0.027, + "num_input_tokens_seen": 32030368, + "step": 35410 + }, + { + "epoch": 9.346839118384585, + "grad_norm": 0.0023526304867118597, + "learning_rate": 0.00962518032695509, + "loss": 0.0059, + "num_input_tokens_seen": 32035008, + "step": 35415 + }, + { + "epoch": 9.348158901940081, + "grad_norm": 0.0007533286698162556, + "learning_rate": 0.009604430364895855, + "loss": 0.0262, + "num_input_tokens_seen": 32039520, + "step": 35420 + }, + { + "epoch": 9.349478685495578, + "grad_norm": 0.0005991508369334042, + "learning_rate": 0.00958370205359777, + "loss": 0.0299, + "num_input_tokens_seen": 32043968, + "step": 35425 + }, + { + "epoch": 9.350798469051076, + "grad_norm": 0.0003081108152400702, + "learning_rate": 0.009562995396257445, + "loss": 0.0103, + "num_input_tokens_seen": 32048480, + "step": 35430 + }, + { + "epoch": 9.352118252606573, + "grad_norm": 0.003964891657233238, + "learning_rate": 0.009542310396068026, + "loss": 0.0156, + "num_input_tokens_seen": 32052960, + "step": 35435 + }, + { + "epoch": 9.35343803616207, + "grad_norm": 0.0007954670581966639, + "learning_rate": 0.009521647056219495, + "loss": 0.0086, + "num_input_tokens_seen": 32057536, + "step": 35440 + }, + { + "epoch": 9.354757819717566, + "grad_norm": 0.0017333251889795065, + "learning_rate": 0.00950100537989832, + "loss": 0.0133, + "num_input_tokens_seen": 32062048, + "step": 35445 + }, + { + "epoch": 9.356077603273063, + "grad_norm": 0.0009866268374025822, + "learning_rate": 0.00948038537028772, + "loss": 0.0167, + "num_input_tokens_seen": 32066272, + "step": 35450 + }, + { + "epoch": 9.357397386828561, + "grad_norm": 0.0045330096036195755, + "learning_rate": 0.009459787030567617, + "loss": 0.0564, + "num_input_tokens_seen": 32070464, + "step": 35455 + }, + { + "epoch": 9.358717170384057, + "grad_norm": 0.0017357268370687962, + "learning_rate": 0.00943921036391449, + "loss": 0.0072, + "num_input_tokens_seen": 32074880, + "step": 35460 + }, + { + "epoch": 9.360036953939554, + "grad_norm": 0.0008622377063147724, + "learning_rate": 0.009418655373501483, + "loss": 0.0185, + "num_input_tokens_seen": 32079648, + "step": 35465 + }, + { + "epoch": 9.36135673749505, + "grad_norm": 0.0005916974623687565, + "learning_rate": 0.00939812206249851, + "loss": 0.0039, + "num_input_tokens_seen": 32084416, + "step": 35470 + }, + { + "epoch": 9.362676521050547, + "grad_norm": 0.004873642697930336, + "learning_rate": 0.009377610434072004, + "loss": 0.0306, + "num_input_tokens_seen": 32088992, + "step": 35475 + }, + { + "epoch": 9.363996304606044, + "grad_norm": 0.0008388335700146854, + "learning_rate": 0.009357120491385167, + "loss": 0.0087, + "num_input_tokens_seen": 32093600, + "step": 35480 + }, + { + "epoch": 9.365316088161542, + "grad_norm": 0.0009941038442775607, + "learning_rate": 0.009336652237597743, + "loss": 0.0122, + "num_input_tokens_seen": 32098080, + "step": 35485 + }, + { + "epoch": 9.366635871717039, + "grad_norm": 0.0006558356690220535, + "learning_rate": 0.009316205675866251, + "loss": 0.029, + "num_input_tokens_seen": 32102688, + "step": 35490 + }, + { + "epoch": 9.367955655272535, + "grad_norm": 0.0006739182863384485, + "learning_rate": 0.00929578080934379, + "loss": 0.0609, + "num_input_tokens_seen": 32107008, + "step": 35495 + }, + { + "epoch": 9.369275438828032, + "grad_norm": 0.0004828079545404762, + "learning_rate": 0.00927537764118012, + "loss": 0.0157, + "num_input_tokens_seen": 32111360, + "step": 35500 + }, + { + "epoch": 9.370595222383528, + "grad_norm": 0.0031999112106859684, + "learning_rate": 0.009254996174521678, + "loss": 0.0094, + "num_input_tokens_seen": 32115840, + "step": 35505 + }, + { + "epoch": 9.371915005939027, + "grad_norm": 0.0017536060186102986, + "learning_rate": 0.009234636412511531, + "loss": 0.0236, + "num_input_tokens_seen": 32120512, + "step": 35510 + }, + { + "epoch": 9.373234789494523, + "grad_norm": 0.0020917628426104784, + "learning_rate": 0.009214298358289418, + "loss": 0.024, + "num_input_tokens_seen": 32124992, + "step": 35515 + }, + { + "epoch": 9.37455457305002, + "grad_norm": 0.0007357831345871091, + "learning_rate": 0.00919398201499173, + "loss": 0.0111, + "num_input_tokens_seen": 32129440, + "step": 35520 + }, + { + "epoch": 9.375874356605516, + "grad_norm": 0.004068072885274887, + "learning_rate": 0.009173687385751495, + "loss": 0.0302, + "num_input_tokens_seen": 32134464, + "step": 35525 + }, + { + "epoch": 9.377194140161013, + "grad_norm": 0.0027074136305600405, + "learning_rate": 0.009153414473698407, + "loss": 0.0179, + "num_input_tokens_seen": 32138816, + "step": 35530 + }, + { + "epoch": 9.378513923716511, + "grad_norm": 0.0004429125692695379, + "learning_rate": 0.009133163281958784, + "loss": 0.0232, + "num_input_tokens_seen": 32143488, + "step": 35535 + }, + { + "epoch": 9.379833707272008, + "grad_norm": 0.0003714966878760606, + "learning_rate": 0.009112933813655627, + "loss": 0.0038, + "num_input_tokens_seen": 32148032, + "step": 35540 + }, + { + "epoch": 9.381153490827504, + "grad_norm": 0.003031749976798892, + "learning_rate": 0.009092726071908573, + "loss": 0.0117, + "num_input_tokens_seen": 32152736, + "step": 35545 + }, + { + "epoch": 9.382473274383, + "grad_norm": 0.002340474631637335, + "learning_rate": 0.0090725400598339, + "loss": 0.0107, + "num_input_tokens_seen": 32157376, + "step": 35550 + }, + { + "epoch": 9.383793057938497, + "grad_norm": 0.004544905852526426, + "learning_rate": 0.009052375780544563, + "loss": 0.0108, + "num_input_tokens_seen": 32161984, + "step": 35555 + }, + { + "epoch": 9.385112841493996, + "grad_norm": 0.00471776956692338, + "learning_rate": 0.009032233237150144, + "loss": 0.0151, + "num_input_tokens_seen": 32166752, + "step": 35560 + }, + { + "epoch": 9.386432625049492, + "grad_norm": 0.001165055320598185, + "learning_rate": 0.009012112432756875, + "loss": 0.0109, + "num_input_tokens_seen": 32171232, + "step": 35565 + }, + { + "epoch": 9.387752408604989, + "grad_norm": 0.000580702384468168, + "learning_rate": 0.008992013370467605, + "loss": 0.0032, + "num_input_tokens_seen": 32175808, + "step": 35570 + }, + { + "epoch": 9.389072192160485, + "grad_norm": 0.0009232331649400294, + "learning_rate": 0.008971936053381924, + "loss": 0.0132, + "num_input_tokens_seen": 32180384, + "step": 35575 + }, + { + "epoch": 9.390391975715982, + "grad_norm": 0.0003441621083766222, + "learning_rate": 0.008951880484595953, + "loss": 0.0102, + "num_input_tokens_seen": 32184640, + "step": 35580 + }, + { + "epoch": 9.39171175927148, + "grad_norm": 0.0005597700364887714, + "learning_rate": 0.008931846667202552, + "loss": 0.0046, + "num_input_tokens_seen": 32189088, + "step": 35585 + }, + { + "epoch": 9.393031542826977, + "grad_norm": 0.0008736743475310504, + "learning_rate": 0.008911834604291152, + "loss": 0.0052, + "num_input_tokens_seen": 32193728, + "step": 35590 + }, + { + "epoch": 9.394351326382473, + "grad_norm": 0.002142898738384247, + "learning_rate": 0.008891844298947882, + "loss": 0.0259, + "num_input_tokens_seen": 32197984, + "step": 35595 + }, + { + "epoch": 9.39567110993797, + "grad_norm": 0.004494826775044203, + "learning_rate": 0.008871875754255508, + "loss": 0.0155, + "num_input_tokens_seen": 32202368, + "step": 35600 + }, + { + "epoch": 9.39567110993797, + "eval_loss": 0.11111979931592941, + "eval_runtime": 75.9026, + "eval_samples_per_second": 88.732, + "eval_steps_per_second": 22.186, + "num_input_tokens_seen": 32202368, + "step": 35600 + }, + { + "epoch": 9.396990893493467, + "grad_norm": 0.0005501205450855196, + "learning_rate": 0.008851928973293422, + "loss": 0.0516, + "num_input_tokens_seen": 32206944, + "step": 35605 + }, + { + "epoch": 9.398310677048965, + "grad_norm": 0.00362643925473094, + "learning_rate": 0.00883200395913764, + "loss": 0.0284, + "num_input_tokens_seen": 32211328, + "step": 35610 + }, + { + "epoch": 9.399630460604461, + "grad_norm": 0.0004082602681592107, + "learning_rate": 0.00881210071486091, + "loss": 0.0118, + "num_input_tokens_seen": 32215616, + "step": 35615 + }, + { + "epoch": 9.400950244159958, + "grad_norm": 0.0025936602614820004, + "learning_rate": 0.008792219243532505, + "loss": 0.0101, + "num_input_tokens_seen": 32220128, + "step": 35620 + }, + { + "epoch": 9.402270027715455, + "grad_norm": 0.0019959236960858107, + "learning_rate": 0.008772359548218428, + "loss": 0.0128, + "num_input_tokens_seen": 32224736, + "step": 35625 + }, + { + "epoch": 9.403589811270951, + "grad_norm": 0.004163950681686401, + "learning_rate": 0.008752521631981274, + "loss": 0.0297, + "num_input_tokens_seen": 32229024, + "step": 35630 + }, + { + "epoch": 9.404909594826448, + "grad_norm": 0.002704859944060445, + "learning_rate": 0.008732705497880315, + "loss": 0.0219, + "num_input_tokens_seen": 32233472, + "step": 35635 + }, + { + "epoch": 9.406229378381946, + "grad_norm": 0.0005295369774103165, + "learning_rate": 0.008712911148971459, + "loss": 0.0076, + "num_input_tokens_seen": 32237664, + "step": 35640 + }, + { + "epoch": 9.407549161937443, + "grad_norm": 0.0011010158341377974, + "learning_rate": 0.008693138588307208, + "loss": 0.0055, + "num_input_tokens_seen": 32242304, + "step": 35645 + }, + { + "epoch": 9.408868945492939, + "grad_norm": 0.004725626669824123, + "learning_rate": 0.008673387818936762, + "loss": 0.0246, + "num_input_tokens_seen": 32246688, + "step": 35650 + }, + { + "epoch": 9.410188729048436, + "grad_norm": 0.0011454460909590125, + "learning_rate": 0.008653658843905948, + "loss": 0.0243, + "num_input_tokens_seen": 32251200, + "step": 35655 + }, + { + "epoch": 9.411508512603932, + "grad_norm": 0.00283124390989542, + "learning_rate": 0.0086339516662572, + "loss": 0.0302, + "num_input_tokens_seen": 32255616, + "step": 35660 + }, + { + "epoch": 9.41282829615943, + "grad_norm": 0.0001921970397233963, + "learning_rate": 0.008614266289029638, + "loss": 0.0127, + "num_input_tokens_seen": 32259936, + "step": 35665 + }, + { + "epoch": 9.414148079714927, + "grad_norm": 0.0057083964347839355, + "learning_rate": 0.008594602715258965, + "loss": 0.0265, + "num_input_tokens_seen": 32264608, + "step": 35670 + }, + { + "epoch": 9.415467863270424, + "grad_norm": 0.0006964633939787745, + "learning_rate": 0.008574960947977573, + "loss": 0.0135, + "num_input_tokens_seen": 32269184, + "step": 35675 + }, + { + "epoch": 9.41678764682592, + "grad_norm": 0.0024171979166567326, + "learning_rate": 0.008555340990214438, + "loss": 0.0134, + "num_input_tokens_seen": 32273440, + "step": 35680 + }, + { + "epoch": 9.418107430381417, + "grad_norm": 0.0003881182346958667, + "learning_rate": 0.008535742844995258, + "loss": 0.0164, + "num_input_tokens_seen": 32277664, + "step": 35685 + }, + { + "epoch": 9.419427213936915, + "grad_norm": 0.0025233542546629906, + "learning_rate": 0.008516166515342266, + "loss": 0.0084, + "num_input_tokens_seen": 32282272, + "step": 35690 + }, + { + "epoch": 9.420746997492412, + "grad_norm": 0.0002752490690909326, + "learning_rate": 0.008496612004274411, + "loss": 0.02, + "num_input_tokens_seen": 32286848, + "step": 35695 + }, + { + "epoch": 9.422066781047908, + "grad_norm": 0.0006465644109994173, + "learning_rate": 0.008477079314807201, + "loss": 0.0148, + "num_input_tokens_seen": 32291264, + "step": 35700 + }, + { + "epoch": 9.423386564603405, + "grad_norm": 0.0010072115110233426, + "learning_rate": 0.008457568449952874, + "loss": 0.0406, + "num_input_tokens_seen": 32295968, + "step": 35705 + }, + { + "epoch": 9.424706348158901, + "grad_norm": 0.0038262149319052696, + "learning_rate": 0.008438079412720189, + "loss": 0.024, + "num_input_tokens_seen": 32300448, + "step": 35710 + }, + { + "epoch": 9.4260261317144, + "grad_norm": 0.002193362917751074, + "learning_rate": 0.00841861220611466, + "loss": 0.0047, + "num_input_tokens_seen": 32305440, + "step": 35715 + }, + { + "epoch": 9.427345915269896, + "grad_norm": 0.0037002037279307842, + "learning_rate": 0.008399166833138355, + "loss": 0.0303, + "num_input_tokens_seen": 32309856, + "step": 35720 + }, + { + "epoch": 9.428665698825393, + "grad_norm": 0.0008081693085841835, + "learning_rate": 0.008379743296789987, + "loss": 0.0051, + "num_input_tokens_seen": 32314432, + "step": 35725 + }, + { + "epoch": 9.42998548238089, + "grad_norm": 0.0032314248383045197, + "learning_rate": 0.008360341600064896, + "loss": 0.0199, + "num_input_tokens_seen": 32318720, + "step": 35730 + }, + { + "epoch": 9.431305265936386, + "grad_norm": 0.00037384688039310277, + "learning_rate": 0.008340961745955121, + "loss": 0.0201, + "num_input_tokens_seen": 32323200, + "step": 35735 + }, + { + "epoch": 9.432625049491884, + "grad_norm": 0.001974450657144189, + "learning_rate": 0.008321603737449224, + "loss": 0.019, + "num_input_tokens_seen": 32327872, + "step": 35740 + }, + { + "epoch": 9.43394483304738, + "grad_norm": 0.0014464674750342965, + "learning_rate": 0.008302267577532479, + "loss": 0.0212, + "num_input_tokens_seen": 32332512, + "step": 35745 + }, + { + "epoch": 9.435264616602877, + "grad_norm": 0.0008764438680373132, + "learning_rate": 0.008282953269186771, + "loss": 0.0076, + "num_input_tokens_seen": 32336832, + "step": 35750 + }, + { + "epoch": 9.436584400158374, + "grad_norm": 0.001504596439190209, + "learning_rate": 0.008263660815390567, + "loss": 0.0143, + "num_input_tokens_seen": 32341280, + "step": 35755 + }, + { + "epoch": 9.43790418371387, + "grad_norm": 0.0023474623449146748, + "learning_rate": 0.008244390219119069, + "loss": 0.0098, + "num_input_tokens_seen": 32345856, + "step": 35760 + }, + { + "epoch": 9.439223967269367, + "grad_norm": 0.0001615009387023747, + "learning_rate": 0.008225141483343967, + "loss": 0.0516, + "num_input_tokens_seen": 32350272, + "step": 35765 + }, + { + "epoch": 9.440543750824865, + "grad_norm": 0.0008670742390677333, + "learning_rate": 0.00820591461103372, + "loss": 0.0386, + "num_input_tokens_seen": 32354880, + "step": 35770 + }, + { + "epoch": 9.441863534380362, + "grad_norm": 0.0025534403976053, + "learning_rate": 0.008186709605153358, + "loss": 0.0512, + "num_input_tokens_seen": 32359136, + "step": 35775 + }, + { + "epoch": 9.443183317935858, + "grad_norm": 0.0027436784002929926, + "learning_rate": 0.008167526468664492, + "loss": 0.0152, + "num_input_tokens_seen": 32363968, + "step": 35780 + }, + { + "epoch": 9.444503101491355, + "grad_norm": 0.0017120195552706718, + "learning_rate": 0.008148365204525443, + "loss": 0.0208, + "num_input_tokens_seen": 32368288, + "step": 35785 + }, + { + "epoch": 9.445822885046852, + "grad_norm": 0.00020388173288665712, + "learning_rate": 0.00812922581569106, + "loss": 0.0142, + "num_input_tokens_seen": 32372704, + "step": 35790 + }, + { + "epoch": 9.44714266860235, + "grad_norm": 0.00014817861665505916, + "learning_rate": 0.008110108305112934, + "loss": 0.0045, + "num_input_tokens_seen": 32376960, + "step": 35795 + }, + { + "epoch": 9.448462452157846, + "grad_norm": 0.0010432485723868012, + "learning_rate": 0.008091012675739223, + "loss": 0.0077, + "num_input_tokens_seen": 32381184, + "step": 35800 + }, + { + "epoch": 9.448462452157846, + "eval_loss": 0.10962583124637604, + "eval_runtime": 75.8729, + "eval_samples_per_second": 88.767, + "eval_steps_per_second": 22.195, + "num_input_tokens_seen": 32381184, + "step": 35800 + }, + { + "epoch": 9.449782235713343, + "grad_norm": 9.155194129562005e-05, + "learning_rate": 0.008071938930514671, + "loss": 0.012, + "num_input_tokens_seen": 32385728, + "step": 35805 + }, + { + "epoch": 9.45110201926884, + "grad_norm": 0.0003730969037860632, + "learning_rate": 0.008052887072380726, + "loss": 0.013, + "num_input_tokens_seen": 32390464, + "step": 35810 + }, + { + "epoch": 9.452421802824336, + "grad_norm": 0.002852648962289095, + "learning_rate": 0.008033857104275437, + "loss": 0.0203, + "num_input_tokens_seen": 32394944, + "step": 35815 + }, + { + "epoch": 9.453741586379834, + "grad_norm": 0.003150331089273095, + "learning_rate": 0.008014849029133424, + "loss": 0.0143, + "num_input_tokens_seen": 32399488, + "step": 35820 + }, + { + "epoch": 9.455061369935331, + "grad_norm": 0.001466700923629105, + "learning_rate": 0.007995862849885975, + "loss": 0.0131, + "num_input_tokens_seen": 32404320, + "step": 35825 + }, + { + "epoch": 9.456381153490828, + "grad_norm": 0.0004503443487919867, + "learning_rate": 0.007976898569461032, + "loss": 0.0254, + "num_input_tokens_seen": 32408768, + "step": 35830 + }, + { + "epoch": 9.457700937046324, + "grad_norm": 0.0002745696110650897, + "learning_rate": 0.007957956190783088, + "loss": 0.0131, + "num_input_tokens_seen": 32413376, + "step": 35835 + }, + { + "epoch": 9.45902072060182, + "grad_norm": 0.0032690202351659536, + "learning_rate": 0.007939035716773324, + "loss": 0.0361, + "num_input_tokens_seen": 32417824, + "step": 35840 + }, + { + "epoch": 9.460340504157319, + "grad_norm": 0.002193956170231104, + "learning_rate": 0.007920137150349487, + "loss": 0.0075, + "num_input_tokens_seen": 32422336, + "step": 35845 + }, + { + "epoch": 9.461660287712816, + "grad_norm": 0.001252591609954834, + "learning_rate": 0.007901260494425981, + "loss": 0.0522, + "num_input_tokens_seen": 32426848, + "step": 35850 + }, + { + "epoch": 9.462980071268312, + "grad_norm": 0.001117808511480689, + "learning_rate": 0.007882405751913861, + "loss": 0.012, + "num_input_tokens_seen": 32431552, + "step": 35855 + }, + { + "epoch": 9.464299854823809, + "grad_norm": 0.0010225874138996005, + "learning_rate": 0.007863572925720702, + "loss": 0.0091, + "num_input_tokens_seen": 32436064, + "step": 35860 + }, + { + "epoch": 9.465619638379305, + "grad_norm": 0.0003756619698833674, + "learning_rate": 0.007844762018750827, + "loss": 0.0092, + "num_input_tokens_seen": 32440288, + "step": 35865 + }, + { + "epoch": 9.466939421934804, + "grad_norm": 0.004477737005800009, + "learning_rate": 0.007825973033905054, + "loss": 0.014, + "num_input_tokens_seen": 32444864, + "step": 35870 + }, + { + "epoch": 9.4682592054903, + "grad_norm": 0.0024340078234672546, + "learning_rate": 0.007807205974080927, + "loss": 0.0146, + "num_input_tokens_seen": 32448992, + "step": 35875 + }, + { + "epoch": 9.469578989045797, + "grad_norm": 0.00017990026390179992, + "learning_rate": 0.007788460842172551, + "loss": 0.0081, + "num_input_tokens_seen": 32453568, + "step": 35880 + }, + { + "epoch": 9.470898772601293, + "grad_norm": 0.0002465127035975456, + "learning_rate": 0.0077697376410706285, + "loss": 0.0086, + "num_input_tokens_seen": 32458464, + "step": 35885 + }, + { + "epoch": 9.47221855615679, + "grad_norm": 0.0016408670926466584, + "learning_rate": 0.007751036373662567, + "loss": 0.011, + "num_input_tokens_seen": 32462976, + "step": 35890 + }, + { + "epoch": 9.473538339712288, + "grad_norm": 0.0007563892868347466, + "learning_rate": 0.00773235704283231, + "loss": 0.0199, + "num_input_tokens_seen": 32467232, + "step": 35895 + }, + { + "epoch": 9.474858123267785, + "grad_norm": 0.0013689434854313731, + "learning_rate": 0.007713699651460437, + "loss": 0.0139, + "num_input_tokens_seen": 32471488, + "step": 35900 + }, + { + "epoch": 9.476177906823281, + "grad_norm": 0.0003437630657572299, + "learning_rate": 0.007695064202424162, + "loss": 0.0102, + "num_input_tokens_seen": 32475872, + "step": 35905 + }, + { + "epoch": 9.477497690378778, + "grad_norm": 0.0013193594058975577, + "learning_rate": 0.007676450698597286, + "loss": 0.0386, + "num_input_tokens_seen": 32480608, + "step": 35910 + }, + { + "epoch": 9.478817473934274, + "grad_norm": 0.0014229604275897145, + "learning_rate": 0.007657859142850265, + "loss": 0.0119, + "num_input_tokens_seen": 32485184, + "step": 35915 + }, + { + "epoch": 9.48013725748977, + "grad_norm": 1.5445433746208437e-05, + "learning_rate": 0.0076392895380501535, + "loss": 0.0114, + "num_input_tokens_seen": 32489664, + "step": 35920 + }, + { + "epoch": 9.48145704104527, + "grad_norm": 0.002897482831031084, + "learning_rate": 0.007620741887060611, + "loss": 0.013, + "num_input_tokens_seen": 32494016, + "step": 35925 + }, + { + "epoch": 9.482776824600766, + "grad_norm": 0.000628196052275598, + "learning_rate": 0.007602216192741901, + "loss": 0.009, + "num_input_tokens_seen": 32498400, + "step": 35930 + }, + { + "epoch": 9.484096608156262, + "grad_norm": 0.002173367654904723, + "learning_rate": 0.007583712457950969, + "loss": 0.0139, + "num_input_tokens_seen": 32503008, + "step": 35935 + }, + { + "epoch": 9.485416391711759, + "grad_norm": 0.0014616235857829452, + "learning_rate": 0.007565230685541269, + "loss": 0.009, + "num_input_tokens_seen": 32507424, + "step": 35940 + }, + { + "epoch": 9.486736175267255, + "grad_norm": 0.0010196615476161242, + "learning_rate": 0.007546770878362968, + "loss": 0.0159, + "num_input_tokens_seen": 32511744, + "step": 35945 + }, + { + "epoch": 9.488055958822754, + "grad_norm": 0.0002592610544525087, + "learning_rate": 0.0075283330392627405, + "loss": 0.0074, + "num_input_tokens_seen": 32516320, + "step": 35950 + }, + { + "epoch": 9.48937574237825, + "grad_norm": 0.001086884643882513, + "learning_rate": 0.007509917171083979, + "loss": 0.0109, + "num_input_tokens_seen": 32521056, + "step": 35955 + }, + { + "epoch": 9.490695525933747, + "grad_norm": 0.0016938666813075542, + "learning_rate": 0.007491523276666662, + "loss": 0.0563, + "num_input_tokens_seen": 32525536, + "step": 35960 + }, + { + "epoch": 9.492015309489243, + "grad_norm": 0.0033902290742844343, + "learning_rate": 0.007473151358847318, + "loss": 0.0101, + "num_input_tokens_seen": 32530144, + "step": 35965 + }, + { + "epoch": 9.49333509304474, + "grad_norm": 0.0008690988761372864, + "learning_rate": 0.007454801420459117, + "loss": 0.0045, + "num_input_tokens_seen": 32534752, + "step": 35970 + }, + { + "epoch": 9.494654876600238, + "grad_norm": 0.0018869140185415745, + "learning_rate": 0.0074364734643319105, + "loss": 0.0073, + "num_input_tokens_seen": 32539392, + "step": 35975 + }, + { + "epoch": 9.495974660155735, + "grad_norm": 0.0009559293976053596, + "learning_rate": 0.007418167493292022, + "loss": 0.012, + "num_input_tokens_seen": 32544096, + "step": 35980 + }, + { + "epoch": 9.497294443711231, + "grad_norm": 0.0007484970265068114, + "learning_rate": 0.0073998835101625245, + "loss": 0.0265, + "num_input_tokens_seen": 32548832, + "step": 35985 + }, + { + "epoch": 9.498614227266728, + "grad_norm": 0.0013105017133057117, + "learning_rate": 0.007381621517762998, + "loss": 0.0092, + "num_input_tokens_seen": 32553504, + "step": 35990 + }, + { + "epoch": 9.499934010822225, + "grad_norm": 0.0012802757555618882, + "learning_rate": 0.007363381518909689, + "loss": 0.0181, + "num_input_tokens_seen": 32558080, + "step": 35995 + }, + { + "epoch": 9.501253794377723, + "grad_norm": 0.0008037160732783377, + "learning_rate": 0.007345163516415448, + "loss": 0.008, + "num_input_tokens_seen": 32562688, + "step": 36000 + }, + { + "epoch": 9.501253794377723, + "eval_loss": 0.11009353399276733, + "eval_runtime": 76.0442, + "eval_samples_per_second": 88.567, + "eval_steps_per_second": 22.145, + "num_input_tokens_seen": 32562688, + "step": 36000 + }, + { + "epoch": 9.50257357793322, + "grad_norm": 0.0010966069530695677, + "learning_rate": 0.007326967513089693, + "loss": 0.0384, + "num_input_tokens_seen": 32567136, + "step": 36005 + }, + { + "epoch": 9.503893361488716, + "grad_norm": 0.0011357638286426663, + "learning_rate": 0.0073087935117384815, + "loss": 0.0155, + "num_input_tokens_seen": 32571936, + "step": 36010 + }, + { + "epoch": 9.505213145044213, + "grad_norm": 0.002032345626503229, + "learning_rate": 0.007290641515164503, + "loss": 0.0183, + "num_input_tokens_seen": 32576544, + "step": 36015 + }, + { + "epoch": 9.506532928599709, + "grad_norm": 0.0011899590026587248, + "learning_rate": 0.007272511526166986, + "loss": 0.0084, + "num_input_tokens_seen": 32581120, + "step": 36020 + }, + { + "epoch": 9.507852712155206, + "grad_norm": 0.0009561367915011942, + "learning_rate": 0.0072544035475418265, + "loss": 0.0117, + "num_input_tokens_seen": 32585664, + "step": 36025 + }, + { + "epoch": 9.509172495710704, + "grad_norm": 0.0023614042438566685, + "learning_rate": 0.007236317582081475, + "loss": 0.0117, + "num_input_tokens_seen": 32590336, + "step": 36030 + }, + { + "epoch": 9.5104922792662, + "grad_norm": 0.00010993362229783088, + "learning_rate": 0.007218253632575066, + "loss": 0.0051, + "num_input_tokens_seen": 32594784, + "step": 36035 + }, + { + "epoch": 9.511812062821697, + "grad_norm": 0.001355478074401617, + "learning_rate": 0.007200211701808223, + "loss": 0.0077, + "num_input_tokens_seen": 32599296, + "step": 36040 + }, + { + "epoch": 9.513131846377194, + "grad_norm": 0.0005556957912631333, + "learning_rate": 0.007182191792563286, + "loss": 0.0095, + "num_input_tokens_seen": 32603776, + "step": 36045 + }, + { + "epoch": 9.51445162993269, + "grad_norm": 0.007063083350658417, + "learning_rate": 0.0071641939076191145, + "loss": 0.0253, + "num_input_tokens_seen": 32608256, + "step": 36050 + }, + { + "epoch": 9.515771413488189, + "grad_norm": 0.0020860047079622746, + "learning_rate": 0.007146218049751257, + "loss": 0.0261, + "num_input_tokens_seen": 32612608, + "step": 36055 + }, + { + "epoch": 9.517091197043685, + "grad_norm": 0.004007165785878897, + "learning_rate": 0.0071282642217317775, + "loss": 0.026, + "num_input_tokens_seen": 32616992, + "step": 36060 + }, + { + "epoch": 9.518410980599182, + "grad_norm": 0.002949357032775879, + "learning_rate": 0.007110332426329396, + "loss": 0.0125, + "num_input_tokens_seen": 32621344, + "step": 36065 + }, + { + "epoch": 9.519730764154678, + "grad_norm": 0.0011237472062930465, + "learning_rate": 0.007092422666309417, + "loss": 0.0205, + "num_input_tokens_seen": 32625888, + "step": 36070 + }, + { + "epoch": 9.521050547710175, + "grad_norm": 0.001921768533065915, + "learning_rate": 0.0070745349444337295, + "loss": 0.0204, + "num_input_tokens_seen": 32630368, + "step": 36075 + }, + { + "epoch": 9.522370331265673, + "grad_norm": 0.003923740703612566, + "learning_rate": 0.007056669263460913, + "loss": 0.0208, + "num_input_tokens_seen": 32634976, + "step": 36080 + }, + { + "epoch": 9.52369011482117, + "grad_norm": 0.0006116019794717431, + "learning_rate": 0.007038825626145995, + "loss": 0.006, + "num_input_tokens_seen": 32639616, + "step": 36085 + }, + { + "epoch": 9.525009898376666, + "grad_norm": 0.006369997747242451, + "learning_rate": 0.007021004035240724, + "loss": 0.022, + "num_input_tokens_seen": 32644000, + "step": 36090 + }, + { + "epoch": 9.526329681932163, + "grad_norm": 0.000905327033251524, + "learning_rate": 0.007003204493493453, + "loss": 0.0149, + "num_input_tokens_seen": 32648256, + "step": 36095 + }, + { + "epoch": 9.52764946548766, + "grad_norm": 0.0002517233369871974, + "learning_rate": 0.006985427003649036, + "loss": 0.0065, + "num_input_tokens_seen": 32653120, + "step": 36100 + }, + { + "epoch": 9.528969249043158, + "grad_norm": 0.0019689835608005524, + "learning_rate": 0.006967671568449013, + "loss": 0.0306, + "num_input_tokens_seen": 32657472, + "step": 36105 + }, + { + "epoch": 9.530289032598654, + "grad_norm": 0.0029368982650339603, + "learning_rate": 0.006949938190631511, + "loss": 0.0227, + "num_input_tokens_seen": 32661952, + "step": 36110 + }, + { + "epoch": 9.53160881615415, + "grad_norm": 0.0006895380211062729, + "learning_rate": 0.0069322268729311905, + "loss": 0.0102, + "num_input_tokens_seen": 32666336, + "step": 36115 + }, + { + "epoch": 9.532928599709647, + "grad_norm": 0.0005594383110292256, + "learning_rate": 0.006914537618079403, + "loss": 0.0142, + "num_input_tokens_seen": 32670752, + "step": 36120 + }, + { + "epoch": 9.534248383265144, + "grad_norm": 0.0014855369227007031, + "learning_rate": 0.006896870428804031, + "loss": 0.0166, + "num_input_tokens_seen": 32675424, + "step": 36125 + }, + { + "epoch": 9.535568166820642, + "grad_norm": 0.0007667699828743935, + "learning_rate": 0.006879225307829595, + "loss": 0.0092, + "num_input_tokens_seen": 32680096, + "step": 36130 + }, + { + "epoch": 9.536887950376139, + "grad_norm": 0.0006803603610023856, + "learning_rate": 0.00686160225787717, + "loss": 0.0056, + "num_input_tokens_seen": 32684576, + "step": 36135 + }, + { + "epoch": 9.538207733931635, + "grad_norm": 0.0003709982556756586, + "learning_rate": 0.006844001281664463, + "loss": 0.0157, + "num_input_tokens_seen": 32689216, + "step": 36140 + }, + { + "epoch": 9.539527517487132, + "grad_norm": 0.0020050122402608395, + "learning_rate": 0.006826422381905789, + "loss": 0.0135, + "num_input_tokens_seen": 32693920, + "step": 36145 + }, + { + "epoch": 9.540847301042628, + "grad_norm": 0.0007515945471823215, + "learning_rate": 0.006808865561311994, + "loss": 0.0098, + "num_input_tokens_seen": 32698496, + "step": 36150 + }, + { + "epoch": 9.542167084598127, + "grad_norm": 0.002909444272518158, + "learning_rate": 0.00679133082259058, + "loss": 0.0134, + "num_input_tokens_seen": 32702944, + "step": 36155 + }, + { + "epoch": 9.543486868153623, + "grad_norm": 0.0006093999836593866, + "learning_rate": 0.00677381816844565, + "loss": 0.0095, + "num_input_tokens_seen": 32707360, + "step": 36160 + }, + { + "epoch": 9.54480665170912, + "grad_norm": 0.0033923587761819363, + "learning_rate": 0.0067563276015778434, + "loss": 0.019, + "num_input_tokens_seen": 32712096, + "step": 36165 + }, + { + "epoch": 9.546126435264616, + "grad_norm": 0.005724167916923761, + "learning_rate": 0.006738859124684437, + "loss": 0.0382, + "num_input_tokens_seen": 32716448, + "step": 36170 + }, + { + "epoch": 9.547446218820113, + "grad_norm": 0.0004976424388587475, + "learning_rate": 0.006721412740459259, + "loss": 0.0115, + "num_input_tokens_seen": 32720992, + "step": 36175 + }, + { + "epoch": 9.54876600237561, + "grad_norm": 0.0032279810402542353, + "learning_rate": 0.006703988451592824, + "loss": 0.018, + "num_input_tokens_seen": 32725312, + "step": 36180 + }, + { + "epoch": 9.550085785931108, + "grad_norm": 0.007885074242949486, + "learning_rate": 0.006686586260772114, + "loss": 0.0191, + "num_input_tokens_seen": 32729792, + "step": 36185 + }, + { + "epoch": 9.551405569486604, + "grad_norm": 0.001425020513124764, + "learning_rate": 0.006669206170680819, + "loss": 0.0318, + "num_input_tokens_seen": 32734496, + "step": 36190 + }, + { + "epoch": 9.552725353042101, + "grad_norm": 0.003770294366404414, + "learning_rate": 0.0066518481839991095, + "loss": 0.0364, + "num_input_tokens_seen": 32738976, + "step": 36195 + }, + { + "epoch": 9.554045136597598, + "grad_norm": 0.0009358838433399796, + "learning_rate": 0.006634512303403861, + "loss": 0.0194, + "num_input_tokens_seen": 32743456, + "step": 36200 + }, + { + "epoch": 9.554045136597598, + "eval_loss": 0.1113118901848793, + "eval_runtime": 75.6803, + "eval_samples_per_second": 88.993, + "eval_steps_per_second": 22.251, + "num_input_tokens_seen": 32743456, + "step": 36200 + }, + { + "epoch": 9.555364920153094, + "grad_norm": 0.004647703841328621, + "learning_rate": 0.0066171985315684355, + "loss": 0.0246, + "num_input_tokens_seen": 32748032, + "step": 36205 + }, + { + "epoch": 9.556684703708592, + "grad_norm": 0.002468879334628582, + "learning_rate": 0.0065999068711628806, + "loss": 0.0038, + "num_input_tokens_seen": 32752704, + "step": 36210 + }, + { + "epoch": 9.558004487264089, + "grad_norm": 0.002070071641355753, + "learning_rate": 0.0065826373248537295, + "loss": 0.0061, + "num_input_tokens_seen": 32757024, + "step": 36215 + }, + { + "epoch": 9.559324270819586, + "grad_norm": 0.0015617110766470432, + "learning_rate": 0.006565389895304218, + "loss": 0.0255, + "num_input_tokens_seen": 32761536, + "step": 36220 + }, + { + "epoch": 9.560644054375082, + "grad_norm": 0.0008909569005481899, + "learning_rate": 0.006548164585174104, + "loss": 0.0085, + "num_input_tokens_seen": 32765856, + "step": 36225 + }, + { + "epoch": 9.561963837930579, + "grad_norm": 0.001164063811302185, + "learning_rate": 0.006530961397119728, + "loss": 0.0135, + "num_input_tokens_seen": 32770336, + "step": 36230 + }, + { + "epoch": 9.563283621486077, + "grad_norm": 0.00015529090887866914, + "learning_rate": 0.00651378033379405, + "loss": 0.0071, + "num_input_tokens_seen": 32774816, + "step": 36235 + }, + { + "epoch": 9.564603405041574, + "grad_norm": 0.001180913532152772, + "learning_rate": 0.006496621397846619, + "loss": 0.0082, + "num_input_tokens_seen": 32779296, + "step": 36240 + }, + { + "epoch": 9.56592318859707, + "grad_norm": 0.0004790840030182153, + "learning_rate": 0.006479484591923518, + "loss": 0.0072, + "num_input_tokens_seen": 32783904, + "step": 36245 + }, + { + "epoch": 9.567242972152567, + "grad_norm": 0.002714331028982997, + "learning_rate": 0.006462369918667515, + "loss": 0.0195, + "num_input_tokens_seen": 32788416, + "step": 36250 + }, + { + "epoch": 9.568562755708063, + "grad_norm": 0.0014434057520702481, + "learning_rate": 0.006445277380717851, + "loss": 0.017, + "num_input_tokens_seen": 32793120, + "step": 36255 + }, + { + "epoch": 9.569882539263562, + "grad_norm": 0.001360514317639172, + "learning_rate": 0.006428206980710466, + "loss": 0.0105, + "num_input_tokens_seen": 32798016, + "step": 36260 + }, + { + "epoch": 9.571202322819058, + "grad_norm": 0.006308207754045725, + "learning_rate": 0.006411158721277788, + "loss": 0.0153, + "num_input_tokens_seen": 32802592, + "step": 36265 + }, + { + "epoch": 9.572522106374555, + "grad_norm": 0.009256291203200817, + "learning_rate": 0.00639413260504888, + "loss": 0.0298, + "num_input_tokens_seen": 32806912, + "step": 36270 + }, + { + "epoch": 9.573841889930051, + "grad_norm": 0.0018290923908352852, + "learning_rate": 0.006377128634649376, + "loss": 0.0076, + "num_input_tokens_seen": 32811520, + "step": 36275 + }, + { + "epoch": 9.575161673485548, + "grad_norm": 0.0016831712564453483, + "learning_rate": 0.006360146812701528, + "loss": 0.0284, + "num_input_tokens_seen": 32815968, + "step": 36280 + }, + { + "epoch": 9.576481457041044, + "grad_norm": 0.0016918792389333248, + "learning_rate": 0.006343187141824125, + "loss": 0.0054, + "num_input_tokens_seen": 32821056, + "step": 36285 + }, + { + "epoch": 9.577801240596543, + "grad_norm": 0.001968757715076208, + "learning_rate": 0.00632624962463259, + "loss": 0.008, + "num_input_tokens_seen": 32825600, + "step": 36290 + }, + { + "epoch": 9.57912102415204, + "grad_norm": 0.0026150827761739492, + "learning_rate": 0.006309334263738853, + "loss": 0.0262, + "num_input_tokens_seen": 32830304, + "step": 36295 + }, + { + "epoch": 9.580440807707536, + "grad_norm": 0.00025969516718760133, + "learning_rate": 0.006292441061751508, + "loss": 0.0072, + "num_input_tokens_seen": 32834720, + "step": 36300 + }, + { + "epoch": 9.581760591263032, + "grad_norm": 0.0006462701712734997, + "learning_rate": 0.0062755700212757054, + "loss": 0.0117, + "num_input_tokens_seen": 32839328, + "step": 36305 + }, + { + "epoch": 9.583080374818529, + "grad_norm": 0.004062538500875235, + "learning_rate": 0.006258721144913148, + "loss": 0.0165, + "num_input_tokens_seen": 32844096, + "step": 36310 + }, + { + "epoch": 9.584400158374027, + "grad_norm": 0.0008480579708702862, + "learning_rate": 0.0062418944352621575, + "loss": 0.0122, + "num_input_tokens_seen": 32849024, + "step": 36315 + }, + { + "epoch": 9.585719941929524, + "grad_norm": 0.0014934962382540107, + "learning_rate": 0.0062250898949176405, + "loss": 0.0136, + "num_input_tokens_seen": 32853664, + "step": 36320 + }, + { + "epoch": 9.58703972548502, + "grad_norm": 0.0009719459339976311, + "learning_rate": 0.006208307526471041, + "loss": 0.0511, + "num_input_tokens_seen": 32857984, + "step": 36325 + }, + { + "epoch": 9.588359509040517, + "grad_norm": 0.0034679214004427195, + "learning_rate": 0.006191547332510405, + "loss": 0.0101, + "num_input_tokens_seen": 32862624, + "step": 36330 + }, + { + "epoch": 9.589679292596013, + "grad_norm": 0.001549924723803997, + "learning_rate": 0.006174809315620416, + "loss": 0.0306, + "num_input_tokens_seen": 32866976, + "step": 36335 + }, + { + "epoch": 9.590999076151512, + "grad_norm": 0.0003782899002544582, + "learning_rate": 0.00615809347838221, + "loss": 0.0213, + "num_input_tokens_seen": 32871680, + "step": 36340 + }, + { + "epoch": 9.592318859707008, + "grad_norm": 0.0009640148491598666, + "learning_rate": 0.006141399823373655, + "loss": 0.0454, + "num_input_tokens_seen": 32876192, + "step": 36345 + }, + { + "epoch": 9.593638643262505, + "grad_norm": 0.0009726291173137724, + "learning_rate": 0.0061247283531690455, + "loss": 0.0067, + "num_input_tokens_seen": 32880832, + "step": 36350 + }, + { + "epoch": 9.594958426818001, + "grad_norm": 0.003018260933458805, + "learning_rate": 0.0061080790703393895, + "loss": 0.017, + "num_input_tokens_seen": 32885248, + "step": 36355 + }, + { + "epoch": 9.596278210373498, + "grad_norm": 0.002045594621449709, + "learning_rate": 0.006091451977452217, + "loss": 0.0067, + "num_input_tokens_seen": 32889696, + "step": 36360 + }, + { + "epoch": 9.597597993928996, + "grad_norm": 0.0007965418044477701, + "learning_rate": 0.00607484707707161, + "loss": 0.0082, + "num_input_tokens_seen": 32893984, + "step": 36365 + }, + { + "epoch": 9.598917777484493, + "grad_norm": 0.0032780487090349197, + "learning_rate": 0.006058264371758254, + "loss": 0.0102, + "num_input_tokens_seen": 32898880, + "step": 36370 + }, + { + "epoch": 9.60023756103999, + "grad_norm": 0.0010063006775453687, + "learning_rate": 0.00604170386406942, + "loss": 0.0102, + "num_input_tokens_seen": 32903296, + "step": 36375 + }, + { + "epoch": 9.601557344595486, + "grad_norm": 0.0014475072966888547, + "learning_rate": 0.006025165556558931, + "loss": 0.0108, + "num_input_tokens_seen": 32908064, + "step": 36380 + }, + { + "epoch": 9.602877128150983, + "grad_norm": 0.006683215498924255, + "learning_rate": 0.006008649451777248, + "loss": 0.0513, + "num_input_tokens_seen": 32912576, + "step": 36385 + }, + { + "epoch": 9.604196911706481, + "grad_norm": 0.0008068166789598763, + "learning_rate": 0.005992155552271283, + "loss": 0.0064, + "num_input_tokens_seen": 32917408, + "step": 36390 + }, + { + "epoch": 9.605516695261977, + "grad_norm": 0.00040945675573311746, + "learning_rate": 0.005975683860584685, + "loss": 0.0063, + "num_input_tokens_seen": 32922080, + "step": 36395 + }, + { + "epoch": 9.606836478817474, + "grad_norm": 0.006525733973830938, + "learning_rate": 0.0059592343792575385, + "loss": 0.0371, + "num_input_tokens_seen": 32926592, + "step": 36400 + }, + { + "epoch": 9.606836478817474, + "eval_loss": 0.11161885410547256, + "eval_runtime": 75.8825, + "eval_samples_per_second": 88.756, + "eval_steps_per_second": 22.192, + "num_input_tokens_seen": 32926592, + "step": 36400 + }, + { + "epoch": 9.60815626237297, + "grad_norm": 0.005191821604967117, + "learning_rate": 0.0059428071108265975, + "loss": 0.0371, + "num_input_tokens_seen": 32931328, + "step": 36405 + }, + { + "epoch": 9.609476045928467, + "grad_norm": 0.0014634609688073397, + "learning_rate": 0.005926402057825136, + "loss": 0.0184, + "num_input_tokens_seen": 32935744, + "step": 36410 + }, + { + "epoch": 9.610795829483965, + "grad_norm": 0.00026561596314422786, + "learning_rate": 0.005910019222782997, + "loss": 0.0043, + "num_input_tokens_seen": 32940064, + "step": 36415 + }, + { + "epoch": 9.612115613039462, + "grad_norm": 0.0011378629133105278, + "learning_rate": 0.005893658608226643, + "loss": 0.0244, + "num_input_tokens_seen": 32944544, + "step": 36420 + }, + { + "epoch": 9.613435396594959, + "grad_norm": 0.0017023002728819847, + "learning_rate": 0.0058773202166791045, + "loss": 0.0046, + "num_input_tokens_seen": 32949024, + "step": 36425 + }, + { + "epoch": 9.614755180150455, + "grad_norm": 0.0034842388704419136, + "learning_rate": 0.005861004050659918, + "loss": 0.0127, + "num_input_tokens_seen": 32953280, + "step": 36430 + }, + { + "epoch": 9.616074963705952, + "grad_norm": 0.0020254841074347496, + "learning_rate": 0.005844710112685286, + "loss": 0.0632, + "num_input_tokens_seen": 32957952, + "step": 36435 + }, + { + "epoch": 9.61739474726145, + "grad_norm": 0.004186123609542847, + "learning_rate": 0.005828438405267933, + "loss": 0.0143, + "num_input_tokens_seen": 32962144, + "step": 36440 + }, + { + "epoch": 9.618714530816947, + "grad_norm": 0.005651433952152729, + "learning_rate": 0.00581218893091715, + "loss": 0.0181, + "num_input_tokens_seen": 32966560, + "step": 36445 + }, + { + "epoch": 9.620034314372443, + "grad_norm": 0.0003107022785115987, + "learning_rate": 0.005795961692138801, + "loss": 0.0107, + "num_input_tokens_seen": 32970944, + "step": 36450 + }, + { + "epoch": 9.62135409792794, + "grad_norm": 0.0010501168435439467, + "learning_rate": 0.00577975669143535, + "loss": 0.0117, + "num_input_tokens_seen": 32975584, + "step": 36455 + }, + { + "epoch": 9.622673881483436, + "grad_norm": 0.000550813740119338, + "learning_rate": 0.005763573931305782, + "loss": 0.0079, + "num_input_tokens_seen": 32980192, + "step": 36460 + }, + { + "epoch": 9.623993665038933, + "grad_norm": 0.003968816716223955, + "learning_rate": 0.005747413414245733, + "loss": 0.0322, + "num_input_tokens_seen": 32984864, + "step": 36465 + }, + { + "epoch": 9.625313448594431, + "grad_norm": 0.0008734037983231246, + "learning_rate": 0.005731275142747294, + "loss": 0.0234, + "num_input_tokens_seen": 32989248, + "step": 36470 + }, + { + "epoch": 9.626633232149928, + "grad_norm": 0.0006112412665970623, + "learning_rate": 0.005715159119299256, + "loss": 0.012, + "num_input_tokens_seen": 32993632, + "step": 36475 + }, + { + "epoch": 9.627953015705424, + "grad_norm": 0.006972688715904951, + "learning_rate": 0.005699065346386867, + "loss": 0.0328, + "num_input_tokens_seen": 32998080, + "step": 36480 + }, + { + "epoch": 9.62927279926092, + "grad_norm": 0.0023454083129763603, + "learning_rate": 0.0056829938264919885, + "loss": 0.0187, + "num_input_tokens_seen": 33002464, + "step": 36485 + }, + { + "epoch": 9.630592582816417, + "grad_norm": 0.00040545177762396634, + "learning_rate": 0.005666944562093074, + "loss": 0.0143, + "num_input_tokens_seen": 33007104, + "step": 36490 + }, + { + "epoch": 9.631912366371916, + "grad_norm": 0.0013907080283388495, + "learning_rate": 0.005650917555665108, + "loss": 0.0216, + "num_input_tokens_seen": 33011552, + "step": 36495 + }, + { + "epoch": 9.633232149927412, + "grad_norm": 0.00033040213747881353, + "learning_rate": 0.005634912809679632, + "loss": 0.0246, + "num_input_tokens_seen": 33015968, + "step": 36500 + }, + { + "epoch": 9.634551933482909, + "grad_norm": 0.0017594121163710952, + "learning_rate": 0.005618930326604854, + "loss": 0.0094, + "num_input_tokens_seen": 33020480, + "step": 36505 + }, + { + "epoch": 9.635871717038405, + "grad_norm": 0.002159327268600464, + "learning_rate": 0.005602970108905386, + "loss": 0.0146, + "num_input_tokens_seen": 33024832, + "step": 36510 + }, + { + "epoch": 9.637191500593902, + "grad_norm": 0.000764794647693634, + "learning_rate": 0.005587032159042543, + "loss": 0.0267, + "num_input_tokens_seen": 33029472, + "step": 36515 + }, + { + "epoch": 9.6385112841494, + "grad_norm": 0.0008952699135988951, + "learning_rate": 0.005571116479474158, + "loss": 0.0074, + "num_input_tokens_seen": 33034048, + "step": 36520 + }, + { + "epoch": 9.639831067704897, + "grad_norm": 0.0010619938839226961, + "learning_rate": 0.005555223072654619, + "loss": 0.0117, + "num_input_tokens_seen": 33038496, + "step": 36525 + }, + { + "epoch": 9.641150851260393, + "grad_norm": 0.002415383467450738, + "learning_rate": 0.005539351941034881, + "loss": 0.0157, + "num_input_tokens_seen": 33043264, + "step": 36530 + }, + { + "epoch": 9.64247063481589, + "grad_norm": 0.0003829746856354177, + "learning_rate": 0.0055235030870624865, + "loss": 0.0077, + "num_input_tokens_seen": 33048032, + "step": 36535 + }, + { + "epoch": 9.643790418371387, + "grad_norm": 0.004826701711863279, + "learning_rate": 0.005507676513181514, + "loss": 0.0384, + "num_input_tokens_seen": 33052544, + "step": 36540 + }, + { + "epoch": 9.645110201926883, + "grad_norm": 0.0018963977927342057, + "learning_rate": 0.005491872221832628, + "loss": 0.0102, + "num_input_tokens_seen": 33057056, + "step": 36545 + }, + { + "epoch": 9.646429985482381, + "grad_norm": 0.0008526855963282287, + "learning_rate": 0.005476090215453061, + "loss": 0.0108, + "num_input_tokens_seen": 33061824, + "step": 36550 + }, + { + "epoch": 9.647749769037878, + "grad_norm": 0.00016097472689580172, + "learning_rate": 0.0054603304964765675, + "loss": 0.0039, + "num_input_tokens_seen": 33066304, + "step": 36555 + }, + { + "epoch": 9.649069552593375, + "grad_norm": 0.002229384146630764, + "learning_rate": 0.005444593067333519, + "loss": 0.0167, + "num_input_tokens_seen": 33070784, + "step": 36560 + }, + { + "epoch": 9.650389336148871, + "grad_norm": 0.0022203559055924416, + "learning_rate": 0.00542887793045081, + "loss": 0.0167, + "num_input_tokens_seen": 33075072, + "step": 36565 + }, + { + "epoch": 9.651709119704368, + "grad_norm": 0.0011114944936707616, + "learning_rate": 0.005413185088251932, + "loss": 0.0075, + "num_input_tokens_seen": 33079424, + "step": 36570 + }, + { + "epoch": 9.653028903259866, + "grad_norm": 0.0016205586725845933, + "learning_rate": 0.005397514543156884, + "loss": 0.0242, + "num_input_tokens_seen": 33083776, + "step": 36575 + }, + { + "epoch": 9.654348686815363, + "grad_norm": 0.0054969703778624535, + "learning_rate": 0.0053818662975822825, + "loss": 0.011, + "num_input_tokens_seen": 33088416, + "step": 36580 + }, + { + "epoch": 9.655668470370859, + "grad_norm": 0.0007466125534847379, + "learning_rate": 0.005366240353941315, + "loss": 0.0131, + "num_input_tokens_seen": 33092832, + "step": 36585 + }, + { + "epoch": 9.656988253926356, + "grad_norm": 0.0001546952553326264, + "learning_rate": 0.005350636714643636, + "loss": 0.005, + "num_input_tokens_seen": 33097024, + "step": 36590 + }, + { + "epoch": 9.658308037481852, + "grad_norm": 0.003675947431474924, + "learning_rate": 0.005335055382095555, + "loss": 0.0281, + "num_input_tokens_seen": 33101312, + "step": 36595 + }, + { + "epoch": 9.65962782103735, + "grad_norm": 0.003591841785237193, + "learning_rate": 0.005319496358699915, + "loss": 0.0185, + "num_input_tokens_seen": 33105696, + "step": 36600 + }, + { + "epoch": 9.65962782103735, + "eval_loss": 0.11173578351736069, + "eval_runtime": 75.8879, + "eval_samples_per_second": 88.749, + "eval_steps_per_second": 22.191, + "num_input_tokens_seen": 33105696, + "step": 36600 + }, + { + "epoch": 9.660947604592847, + "grad_norm": 0.004403884522616863, + "learning_rate": 0.005303959646856099, + "loss": 0.0222, + "num_input_tokens_seen": 33110464, + "step": 36605 + }, + { + "epoch": 9.662267388148344, + "grad_norm": 0.000445193873019889, + "learning_rate": 0.005288445248960089, + "loss": 0.0084, + "num_input_tokens_seen": 33114688, + "step": 36610 + }, + { + "epoch": 9.66358717170384, + "grad_norm": 0.009739452973008156, + "learning_rate": 0.005272953167404354, + "loss": 0.0253, + "num_input_tokens_seen": 33119584, + "step": 36615 + }, + { + "epoch": 9.664906955259337, + "grad_norm": 0.0006706370622850955, + "learning_rate": 0.005257483404578017, + "loss": 0.0113, + "num_input_tokens_seen": 33124192, + "step": 36620 + }, + { + "epoch": 9.666226738814835, + "grad_norm": 0.000990161206573248, + "learning_rate": 0.0052420359628666865, + "loss": 0.0338, + "num_input_tokens_seen": 33128640, + "step": 36625 + }, + { + "epoch": 9.667546522370332, + "grad_norm": 0.0017074857605621219, + "learning_rate": 0.00522661084465254, + "loss": 0.0111, + "num_input_tokens_seen": 33133248, + "step": 36630 + }, + { + "epoch": 9.668866305925828, + "grad_norm": 0.00046369980555027723, + "learning_rate": 0.005211208052314326, + "loss": 0.0155, + "num_input_tokens_seen": 33137824, + "step": 36635 + }, + { + "epoch": 9.670186089481325, + "grad_norm": 0.0018976128194481134, + "learning_rate": 0.005195827588227391, + "loss": 0.0062, + "num_input_tokens_seen": 33142368, + "step": 36640 + }, + { + "epoch": 9.671505873036821, + "grad_norm": 0.00019597867503762245, + "learning_rate": 0.0051804694547635255, + "loss": 0.0054, + "num_input_tokens_seen": 33146560, + "step": 36645 + }, + { + "epoch": 9.67282565659232, + "grad_norm": 0.0003149389522150159, + "learning_rate": 0.005165133654291232, + "loss": 0.0151, + "num_input_tokens_seen": 33150880, + "step": 36650 + }, + { + "epoch": 9.674145440147816, + "grad_norm": 0.00011787186667788774, + "learning_rate": 0.005149820189175402, + "loss": 0.0108, + "num_input_tokens_seen": 33155488, + "step": 36655 + }, + { + "epoch": 9.675465223703313, + "grad_norm": 0.0007996238418854773, + "learning_rate": 0.005134529061777598, + "loss": 0.0052, + "num_input_tokens_seen": 33159904, + "step": 36660 + }, + { + "epoch": 9.67678500725881, + "grad_norm": 0.00025603416725061834, + "learning_rate": 0.005119260274455933, + "loss": 0.0268, + "num_input_tokens_seen": 33164160, + "step": 36665 + }, + { + "epoch": 9.678104790814306, + "grad_norm": 0.0005116892280057073, + "learning_rate": 0.005104013829565007, + "loss": 0.0127, + "num_input_tokens_seen": 33168672, + "step": 36670 + }, + { + "epoch": 9.679424574369804, + "grad_norm": 0.00047853286378085613, + "learning_rate": 0.005088789729456006, + "loss": 0.0078, + "num_input_tokens_seen": 33173024, + "step": 36675 + }, + { + "epoch": 9.6807443579253, + "grad_norm": 0.007074594963341951, + "learning_rate": 0.005073587976476735, + "loss": 0.0363, + "num_input_tokens_seen": 33177664, + "step": 36680 + }, + { + "epoch": 9.682064141480797, + "grad_norm": 0.000292989076115191, + "learning_rate": 0.005058408572971418, + "loss": 0.0214, + "num_input_tokens_seen": 33182528, + "step": 36685 + }, + { + "epoch": 9.683383925036294, + "grad_norm": 0.0005234499694779515, + "learning_rate": 0.005043251521280983, + "loss": 0.0079, + "num_input_tokens_seen": 33187232, + "step": 36690 + }, + { + "epoch": 9.68470370859179, + "grad_norm": 0.00020704652706626803, + "learning_rate": 0.005028116823742795, + "loss": 0.0182, + "num_input_tokens_seen": 33192032, + "step": 36695 + }, + { + "epoch": 9.686023492147289, + "grad_norm": 0.004330973606556654, + "learning_rate": 0.005013004482690819, + "loss": 0.0193, + "num_input_tokens_seen": 33196704, + "step": 36700 + }, + { + "epoch": 9.687343275702785, + "grad_norm": 0.0018897149711847305, + "learning_rate": 0.0049979145004555746, + "loss": 0.033, + "num_input_tokens_seen": 33201184, + "step": 36705 + }, + { + "epoch": 9.688663059258282, + "grad_norm": 0.0012126858346164227, + "learning_rate": 0.004982846879364116, + "loss": 0.0202, + "num_input_tokens_seen": 33205312, + "step": 36710 + }, + { + "epoch": 9.689982842813778, + "grad_norm": 0.0024423471186310053, + "learning_rate": 0.0049678016217400535, + "loss": 0.0302, + "num_input_tokens_seen": 33209760, + "step": 36715 + }, + { + "epoch": 9.691302626369275, + "grad_norm": 0.001052108476869762, + "learning_rate": 0.004952778729903595, + "loss": 0.0168, + "num_input_tokens_seen": 33214432, + "step": 36720 + }, + { + "epoch": 9.692622409924772, + "grad_norm": 0.001318443100899458, + "learning_rate": 0.004937778206171422, + "loss": 0.0101, + "num_input_tokens_seen": 33218880, + "step": 36725 + }, + { + "epoch": 9.69394219348027, + "grad_norm": 0.00016806238272693008, + "learning_rate": 0.004922800052856835, + "loss": 0.0175, + "num_input_tokens_seen": 33223584, + "step": 36730 + }, + { + "epoch": 9.695261977035766, + "grad_norm": 0.002962312661111355, + "learning_rate": 0.004907844272269602, + "loss": 0.01, + "num_input_tokens_seen": 33228032, + "step": 36735 + }, + { + "epoch": 9.696581760591263, + "grad_norm": 0.0012533159460872412, + "learning_rate": 0.004892910866716144, + "loss": 0.0124, + "num_input_tokens_seen": 33232192, + "step": 36740 + }, + { + "epoch": 9.69790154414676, + "grad_norm": 0.0005152537487447262, + "learning_rate": 0.004877999838499369, + "loss": 0.0122, + "num_input_tokens_seen": 33237088, + "step": 36745 + }, + { + "epoch": 9.699221327702256, + "grad_norm": 0.004541686270385981, + "learning_rate": 0.0048631111899187065, + "loss": 0.0193, + "num_input_tokens_seen": 33241760, + "step": 36750 + }, + { + "epoch": 9.700541111257754, + "grad_norm": 0.002145425882190466, + "learning_rate": 0.0048482449232702335, + "loss": 0.0181, + "num_input_tokens_seen": 33246240, + "step": 36755 + }, + { + "epoch": 9.701860894813251, + "grad_norm": 0.0037140739150345325, + "learning_rate": 0.004833401040846469, + "loss": 0.0098, + "num_input_tokens_seen": 33250624, + "step": 36760 + }, + { + "epoch": 9.703180678368748, + "grad_norm": 0.004169657826423645, + "learning_rate": 0.004818579544936546, + "loss": 0.0156, + "num_input_tokens_seen": 33255200, + "step": 36765 + }, + { + "epoch": 9.704500461924244, + "grad_norm": 0.001681805937550962, + "learning_rate": 0.004803780437826121, + "loss": 0.0075, + "num_input_tokens_seen": 33259520, + "step": 36770 + }, + { + "epoch": 9.70582024547974, + "grad_norm": 0.00127736188005656, + "learning_rate": 0.004789003721797402, + "loss": 0.008, + "num_input_tokens_seen": 33263840, + "step": 36775 + }, + { + "epoch": 9.707140029035239, + "grad_norm": 0.0005982627626508474, + "learning_rate": 0.004774249399129132, + "loss": 0.0227, + "num_input_tokens_seen": 33268512, + "step": 36780 + }, + { + "epoch": 9.708459812590736, + "grad_norm": 0.003994709346443415, + "learning_rate": 0.004759517472096642, + "loss": 0.0119, + "num_input_tokens_seen": 33272928, + "step": 36785 + }, + { + "epoch": 9.709779596146232, + "grad_norm": 0.00258527765981853, + "learning_rate": 0.004744807942971746, + "loss": 0.0104, + "num_input_tokens_seen": 33277600, + "step": 36790 + }, + { + "epoch": 9.711099379701729, + "grad_norm": 0.000599072955083102, + "learning_rate": 0.004730120814022881, + "loss": 0.0073, + "num_input_tokens_seen": 33282272, + "step": 36795 + }, + { + "epoch": 9.712419163257225, + "grad_norm": 0.0035027945414185524, + "learning_rate": 0.004715456087514935, + "loss": 0.0174, + "num_input_tokens_seen": 33286560, + "step": 36800 + }, + { + "epoch": 9.712419163257225, + "eval_loss": 0.11154687404632568, + "eval_runtime": 75.9088, + "eval_samples_per_second": 88.725, + "eval_steps_per_second": 22.185, + "num_input_tokens_seen": 33286560, + "step": 36800 + }, + { + "epoch": 9.713738946812724, + "grad_norm": 0.0010248265461996198, + "learning_rate": 0.004700813765709432, + "loss": 0.0109, + "num_input_tokens_seen": 33291328, + "step": 36805 + }, + { + "epoch": 9.71505873036822, + "grad_norm": 7.588506559841335e-05, + "learning_rate": 0.004686193850864401, + "loss": 0.0065, + "num_input_tokens_seen": 33295808, + "step": 36810 + }, + { + "epoch": 9.716378513923717, + "grad_norm": 0.00032428346457891166, + "learning_rate": 0.004671596345234385, + "loss": 0.0284, + "num_input_tokens_seen": 33300224, + "step": 36815 + }, + { + "epoch": 9.717698297479213, + "grad_norm": 0.004471346270292997, + "learning_rate": 0.00465702125107052, + "loss": 0.0134, + "num_input_tokens_seen": 33304928, + "step": 36820 + }, + { + "epoch": 9.71901808103471, + "grad_norm": 7.276333053596318e-05, + "learning_rate": 0.004642468570620506, + "loss": 0.0139, + "num_input_tokens_seen": 33309408, + "step": 36825 + }, + { + "epoch": 9.720337864590206, + "grad_norm": 0.0004698260163422674, + "learning_rate": 0.004627938306128482, + "loss": 0.072, + "num_input_tokens_seen": 33314144, + "step": 36830 + }, + { + "epoch": 9.721657648145705, + "grad_norm": 0.0016975858015939593, + "learning_rate": 0.004613430459835255, + "loss": 0.0079, + "num_input_tokens_seen": 33318656, + "step": 36835 + }, + { + "epoch": 9.722977431701201, + "grad_norm": 0.00017938052769750357, + "learning_rate": 0.004598945033978085, + "loss": 0.0031, + "num_input_tokens_seen": 33323040, + "step": 36840 + }, + { + "epoch": 9.724297215256698, + "grad_norm": 0.0011936475057154894, + "learning_rate": 0.004584482030790804, + "loss": 0.0291, + "num_input_tokens_seen": 33327744, + "step": 36845 + }, + { + "epoch": 9.725616998812194, + "grad_norm": 0.0005810860893689096, + "learning_rate": 0.004570041452503826, + "loss": 0.0133, + "num_input_tokens_seen": 33332128, + "step": 36850 + }, + { + "epoch": 9.72693678236769, + "grad_norm": 0.0002798457571770996, + "learning_rate": 0.004555623301344003, + "loss": 0.029, + "num_input_tokens_seen": 33336672, + "step": 36855 + }, + { + "epoch": 9.72825656592319, + "grad_norm": 0.00038330481038428843, + "learning_rate": 0.004541227579534857, + "loss": 0.0117, + "num_input_tokens_seen": 33341248, + "step": 36860 + }, + { + "epoch": 9.729576349478686, + "grad_norm": 9.32980838115327e-05, + "learning_rate": 0.004526854289296378, + "loss": 0.027, + "num_input_tokens_seen": 33345728, + "step": 36865 + }, + { + "epoch": 9.730896133034182, + "grad_norm": 0.0007535468903370202, + "learning_rate": 0.004512503432845078, + "loss": 0.0225, + "num_input_tokens_seen": 33350016, + "step": 36870 + }, + { + "epoch": 9.732215916589679, + "grad_norm": 0.0002912606578320265, + "learning_rate": 0.004498175012394068, + "loss": 0.0111, + "num_input_tokens_seen": 33354496, + "step": 36875 + }, + { + "epoch": 9.733535700145175, + "grad_norm": 0.0020822924561798573, + "learning_rate": 0.004483869030152965, + "loss": 0.0137, + "num_input_tokens_seen": 33359168, + "step": 36880 + }, + { + "epoch": 9.734855483700674, + "grad_norm": 0.0004413714923430234, + "learning_rate": 0.004469585488327904, + "loss": 0.0123, + "num_input_tokens_seen": 33363808, + "step": 36885 + }, + { + "epoch": 9.73617526725617, + "grad_norm": 0.0006911021191626787, + "learning_rate": 0.0044553243891216395, + "loss": 0.0079, + "num_input_tokens_seen": 33368480, + "step": 36890 + }, + { + "epoch": 9.737495050811667, + "grad_norm": 0.005141485016793013, + "learning_rate": 0.004441085734733363, + "loss": 0.0151, + "num_input_tokens_seen": 33372768, + "step": 36895 + }, + { + "epoch": 9.738814834367163, + "grad_norm": 0.0008510564803145826, + "learning_rate": 0.004426869527358884, + "loss": 0.0233, + "num_input_tokens_seen": 33377536, + "step": 36900 + }, + { + "epoch": 9.74013461792266, + "grad_norm": 0.0027197340968996286, + "learning_rate": 0.0044126757691905156, + "loss": 0.0195, + "num_input_tokens_seen": 33382112, + "step": 36905 + }, + { + "epoch": 9.741454401478158, + "grad_norm": 0.0002651086251717061, + "learning_rate": 0.004398504462417107, + "loss": 0.0437, + "num_input_tokens_seen": 33386528, + "step": 36910 + }, + { + "epoch": 9.742774185033655, + "grad_norm": 0.0011269479291513562, + "learning_rate": 0.0043843556092240605, + "loss": 0.0409, + "num_input_tokens_seen": 33391008, + "step": 36915 + }, + { + "epoch": 9.744093968589151, + "grad_norm": 0.005864135455340147, + "learning_rate": 0.004370229211793281, + "loss": 0.0197, + "num_input_tokens_seen": 33395552, + "step": 36920 + }, + { + "epoch": 9.745413752144648, + "grad_norm": 0.001021899632178247, + "learning_rate": 0.0043561252723032405, + "loss": 0.0124, + "num_input_tokens_seen": 33400128, + "step": 36925 + }, + { + "epoch": 9.746733535700145, + "grad_norm": 0.004617548547685146, + "learning_rate": 0.004342043792929001, + "loss": 0.0232, + "num_input_tokens_seen": 33404512, + "step": 36930 + }, + { + "epoch": 9.748053319255643, + "grad_norm": 0.0016084262169897556, + "learning_rate": 0.004327984775842025, + "loss": 0.0125, + "num_input_tokens_seen": 33409088, + "step": 36935 + }, + { + "epoch": 9.74937310281114, + "grad_norm": 0.002261881250888109, + "learning_rate": 0.004313948223210428, + "loss": 0.0097, + "num_input_tokens_seen": 33413504, + "step": 36940 + }, + { + "epoch": 9.750692886366636, + "grad_norm": 0.004794834181666374, + "learning_rate": 0.004299934137198846, + "loss": 0.0162, + "num_input_tokens_seen": 33417856, + "step": 36945 + }, + { + "epoch": 9.752012669922133, + "grad_norm": 0.005629934836179018, + "learning_rate": 0.004285942519968383, + "loss": 0.0135, + "num_input_tokens_seen": 33422848, + "step": 36950 + }, + { + "epoch": 9.75333245347763, + "grad_norm": 0.0013705076416954398, + "learning_rate": 0.004271973373676746, + "loss": 0.0339, + "num_input_tokens_seen": 33427360, + "step": 36955 + }, + { + "epoch": 9.754652237033127, + "grad_norm": 0.00034430381492711604, + "learning_rate": 0.004258026700478146, + "loss": 0.0225, + "num_input_tokens_seen": 33431552, + "step": 36960 + }, + { + "epoch": 9.755972020588624, + "grad_norm": 0.002157797571271658, + "learning_rate": 0.004244102502523328, + "loss": 0.0078, + "num_input_tokens_seen": 33435968, + "step": 36965 + }, + { + "epoch": 9.75729180414412, + "grad_norm": 0.00035975774517282844, + "learning_rate": 0.004230200781959592, + "loss": 0.0251, + "num_input_tokens_seen": 33440512, + "step": 36970 + }, + { + "epoch": 9.758611587699617, + "grad_norm": 0.0024875456001609564, + "learning_rate": 0.004216321540930756, + "loss": 0.0118, + "num_input_tokens_seen": 33444960, + "step": 36975 + }, + { + "epoch": 9.759931371255114, + "grad_norm": 0.0006162413628771901, + "learning_rate": 0.004202464781577175, + "loss": 0.0042, + "num_input_tokens_seen": 33449312, + "step": 36980 + }, + { + "epoch": 9.76125115481061, + "grad_norm": 0.001142297638580203, + "learning_rate": 0.00418863050603574, + "loss": 0.0117, + "num_input_tokens_seen": 33453632, + "step": 36985 + }, + { + "epoch": 9.762570938366109, + "grad_norm": 0.0020919479429721832, + "learning_rate": 0.004174818716439843, + "loss": 0.0225, + "num_input_tokens_seen": 33458464, + "step": 36990 + }, + { + "epoch": 9.763890721921605, + "grad_norm": 0.0009416330140084028, + "learning_rate": 0.004161029414919464, + "loss": 0.0126, + "num_input_tokens_seen": 33463136, + "step": 36995 + }, + { + "epoch": 9.765210505477102, + "grad_norm": 0.0032137606758624315, + "learning_rate": 0.004147262603601071, + "loss": 0.0269, + "num_input_tokens_seen": 33468000, + "step": 37000 + }, + { + "epoch": 9.765210505477102, + "eval_loss": 0.11119474470615387, + "eval_runtime": 75.9002, + "eval_samples_per_second": 88.735, + "eval_steps_per_second": 22.187, + "num_input_tokens_seen": 33468000, + "step": 37000 + }, + { + "epoch": 9.766530289032598, + "grad_norm": 0.0005815200856886804, + "learning_rate": 0.004133518284607679, + "loss": 0.0131, + "num_input_tokens_seen": 33472544, + "step": 37005 + }, + { + "epoch": 9.767850072588095, + "grad_norm": 0.0010160120436921716, + "learning_rate": 0.004119796460058861, + "loss": 0.015, + "num_input_tokens_seen": 33477184, + "step": 37010 + }, + { + "epoch": 9.769169856143593, + "grad_norm": 0.00014108036702964455, + "learning_rate": 0.00410609713207064, + "loss": 0.0213, + "num_input_tokens_seen": 33481472, + "step": 37015 + }, + { + "epoch": 9.77048963969909, + "grad_norm": 0.0003952770202886313, + "learning_rate": 0.004092420302755678, + "loss": 0.0053, + "num_input_tokens_seen": 33485728, + "step": 37020 + }, + { + "epoch": 9.771809423254586, + "grad_norm": 0.00034299332764931023, + "learning_rate": 0.004078765974223103, + "loss": 0.007, + "num_input_tokens_seen": 33490208, + "step": 37025 + }, + { + "epoch": 9.773129206810083, + "grad_norm": 0.0028995845932513475, + "learning_rate": 0.004065134148578564, + "loss": 0.0131, + "num_input_tokens_seen": 33494656, + "step": 37030 + }, + { + "epoch": 9.77444899036558, + "grad_norm": 0.002565915696322918, + "learning_rate": 0.004051524827924279, + "loss": 0.0581, + "num_input_tokens_seen": 33499200, + "step": 37035 + }, + { + "epoch": 9.775768773921078, + "grad_norm": 0.002426102291792631, + "learning_rate": 0.004037938014358955, + "loss": 0.0268, + "num_input_tokens_seen": 33503456, + "step": 37040 + }, + { + "epoch": 9.777088557476574, + "grad_norm": 0.0028943317010998726, + "learning_rate": 0.004024373709977863, + "loss": 0.0074, + "num_input_tokens_seen": 33507840, + "step": 37045 + }, + { + "epoch": 9.77840834103207, + "grad_norm": 0.0005952013889327645, + "learning_rate": 0.004010831916872814, + "loss": 0.0122, + "num_input_tokens_seen": 33512544, + "step": 37050 + }, + { + "epoch": 9.779728124587567, + "grad_norm": 0.0012696763733401895, + "learning_rate": 0.003997312637132089, + "loss": 0.0117, + "num_input_tokens_seen": 33517248, + "step": 37055 + }, + { + "epoch": 9.781047908143064, + "grad_norm": 0.00022036970767658204, + "learning_rate": 0.003983815872840535, + "loss": 0.0324, + "num_input_tokens_seen": 33521728, + "step": 37060 + }, + { + "epoch": 9.782367691698562, + "grad_norm": 0.0001225513988174498, + "learning_rate": 0.003970341626079521, + "loss": 0.0047, + "num_input_tokens_seen": 33526208, + "step": 37065 + }, + { + "epoch": 9.783687475254059, + "grad_norm": 0.0017109751934185624, + "learning_rate": 0.003956889898926952, + "loss": 0.011, + "num_input_tokens_seen": 33530880, + "step": 37070 + }, + { + "epoch": 9.785007258809555, + "grad_norm": 0.005551970563828945, + "learning_rate": 0.0039434606934572675, + "loss": 0.0417, + "num_input_tokens_seen": 33535584, + "step": 37075 + }, + { + "epoch": 9.786327042365052, + "grad_norm": 0.0004413158749230206, + "learning_rate": 0.003930054011741396, + "loss": 0.008, + "num_input_tokens_seen": 33540032, + "step": 37080 + }, + { + "epoch": 9.787646825920548, + "grad_norm": 0.0038993905764073133, + "learning_rate": 0.0039166698558468155, + "loss": 0.015, + "num_input_tokens_seen": 33544576, + "step": 37085 + }, + { + "epoch": 9.788966609476045, + "grad_norm": 0.0006930019590072334, + "learning_rate": 0.0039033082278375594, + "loss": 0.03, + "num_input_tokens_seen": 33549280, + "step": 37090 + }, + { + "epoch": 9.790286393031543, + "grad_norm": 0.00022612581960856915, + "learning_rate": 0.003889969129774112, + "loss": 0.0194, + "num_input_tokens_seen": 33553760, + "step": 37095 + }, + { + "epoch": 9.79160617658704, + "grad_norm": 0.001039173104800284, + "learning_rate": 0.0038766525637135784, + "loss": 0.0047, + "num_input_tokens_seen": 33558592, + "step": 37100 + }, + { + "epoch": 9.792925960142536, + "grad_norm": 0.00023072896874509752, + "learning_rate": 0.0038633585317095318, + "loss": 0.0256, + "num_input_tokens_seen": 33562880, + "step": 37105 + }, + { + "epoch": 9.794245743698033, + "grad_norm": 0.000727908278349787, + "learning_rate": 0.00385008703581205, + "loss": 0.0104, + "num_input_tokens_seen": 33567168, + "step": 37110 + }, + { + "epoch": 9.79556552725353, + "grad_norm": 0.0011125251185148954, + "learning_rate": 0.0038368380780677944, + "loss": 0.0059, + "num_input_tokens_seen": 33571968, + "step": 37115 + }, + { + "epoch": 9.796885310809028, + "grad_norm": 0.004815234337002039, + "learning_rate": 0.003823611660519882, + "loss": 0.0132, + "num_input_tokens_seen": 33576480, + "step": 37120 + }, + { + "epoch": 9.798205094364524, + "grad_norm": 0.0032200831919908524, + "learning_rate": 0.0038104077852080475, + "loss": 0.0467, + "num_input_tokens_seen": 33580832, + "step": 37125 + }, + { + "epoch": 9.799524877920021, + "grad_norm": 0.0025970563292503357, + "learning_rate": 0.003797226454168462, + "loss": 0.0201, + "num_input_tokens_seen": 33585184, + "step": 37130 + }, + { + "epoch": 9.800844661475518, + "grad_norm": 0.0010181106626987457, + "learning_rate": 0.003784067669433849, + "loss": 0.0066, + "num_input_tokens_seen": 33589952, + "step": 37135 + }, + { + "epoch": 9.802164445031014, + "grad_norm": 0.0038973570335656404, + "learning_rate": 0.0037709314330334528, + "loss": 0.0072, + "num_input_tokens_seen": 33594528, + "step": 37140 + }, + { + "epoch": 9.803484228586512, + "grad_norm": 0.00033736470504663885, + "learning_rate": 0.003757817746993086, + "loss": 0.0088, + "num_input_tokens_seen": 33599232, + "step": 37145 + }, + { + "epoch": 9.804804012142009, + "grad_norm": 0.0003603637160267681, + "learning_rate": 0.0037447266133349977, + "loss": 0.0179, + "num_input_tokens_seen": 33603808, + "step": 37150 + }, + { + "epoch": 9.806123795697506, + "grad_norm": 0.0009975662687793374, + "learning_rate": 0.003731658034078039, + "loss": 0.0114, + "num_input_tokens_seen": 33608160, + "step": 37155 + }, + { + "epoch": 9.807443579253002, + "grad_norm": 0.0030462162103503942, + "learning_rate": 0.0037186120112375153, + "loss": 0.0099, + "num_input_tokens_seen": 33612960, + "step": 37160 + }, + { + "epoch": 9.808763362808499, + "grad_norm": 0.0038181261625140905, + "learning_rate": 0.003705588546825317, + "loss": 0.0294, + "num_input_tokens_seen": 33617440, + "step": 37165 + }, + { + "epoch": 9.810083146363997, + "grad_norm": 0.0027229993138462305, + "learning_rate": 0.0036925876428498205, + "loss": 0.0182, + "num_input_tokens_seen": 33622144, + "step": 37170 + }, + { + "epoch": 9.811402929919494, + "grad_norm": 0.00022199592785909772, + "learning_rate": 0.0036796093013159057, + "loss": 0.0084, + "num_input_tokens_seen": 33626624, + "step": 37175 + }, + { + "epoch": 9.81272271347499, + "grad_norm": 0.00018571042164694518, + "learning_rate": 0.0036666535242250217, + "loss": 0.0266, + "num_input_tokens_seen": 33631584, + "step": 37180 + }, + { + "epoch": 9.814042497030487, + "grad_norm": 0.0005958116962574422, + "learning_rate": 0.003653720313575104, + "loss": 0.0055, + "num_input_tokens_seen": 33636096, + "step": 37185 + }, + { + "epoch": 9.815362280585983, + "grad_norm": 0.00031114238663576543, + "learning_rate": 0.003640809671360623, + "loss": 0.0187, + "num_input_tokens_seen": 33640608, + "step": 37190 + }, + { + "epoch": 9.816682064141482, + "grad_norm": 0.0002338719932595268, + "learning_rate": 0.003627921599572553, + "loss": 0.02, + "num_input_tokens_seen": 33645056, + "step": 37195 + }, + { + "epoch": 9.818001847696978, + "grad_norm": 0.0016741367289796472, + "learning_rate": 0.003615056100198405, + "loss": 0.0089, + "num_input_tokens_seen": 33650176, + "step": 37200 + }, + { + "epoch": 9.818001847696978, + "eval_loss": 0.11127901077270508, + "eval_runtime": 75.8072, + "eval_samples_per_second": 88.844, + "eval_steps_per_second": 22.214, + "num_input_tokens_seen": 33650176, + "step": 37200 + }, + { + "epoch": 9.819321631252475, + "grad_norm": 0.001360520487651229, + "learning_rate": 0.003602213175222174, + "loss": 0.0311, + "num_input_tokens_seen": 33654592, + "step": 37205 + }, + { + "epoch": 9.820641414807971, + "grad_norm": 0.00043316135997883976, + "learning_rate": 0.0035893928266244432, + "loss": 0.0106, + "num_input_tokens_seen": 33659232, + "step": 37210 + }, + { + "epoch": 9.821961198363468, + "grad_norm": 0.0030336505733430386, + "learning_rate": 0.003576595056382248, + "loss": 0.0178, + "num_input_tokens_seen": 33663712, + "step": 37215 + }, + { + "epoch": 9.823280981918966, + "grad_norm": 0.005592163186520338, + "learning_rate": 0.0035638198664691423, + "loss": 0.0387, + "num_input_tokens_seen": 33668224, + "step": 37220 + }, + { + "epoch": 9.824600765474463, + "grad_norm": 0.000769327743910253, + "learning_rate": 0.003551067258855267, + "loss": 0.0097, + "num_input_tokens_seen": 33672736, + "step": 37225 + }, + { + "epoch": 9.82592054902996, + "grad_norm": 0.0012864107266068459, + "learning_rate": 0.0035383372355071996, + "loss": 0.0102, + "num_input_tokens_seen": 33677216, + "step": 37230 + }, + { + "epoch": 9.827240332585456, + "grad_norm": 0.0012065684422850609, + "learning_rate": 0.0035256297983881023, + "loss": 0.0088, + "num_input_tokens_seen": 33681536, + "step": 37235 + }, + { + "epoch": 9.828560116140952, + "grad_norm": 0.002741005504503846, + "learning_rate": 0.0035129449494575747, + "loss": 0.0282, + "num_input_tokens_seen": 33685952, + "step": 37240 + }, + { + "epoch": 9.82987989969645, + "grad_norm": 0.0002793163002934307, + "learning_rate": 0.0035002826906718187, + "loss": 0.0131, + "num_input_tokens_seen": 33690560, + "step": 37245 + }, + { + "epoch": 9.831199683251947, + "grad_norm": 0.0005853547481819987, + "learning_rate": 0.003487643023983522, + "loss": 0.0101, + "num_input_tokens_seen": 33694848, + "step": 37250 + }, + { + "epoch": 9.832519466807444, + "grad_norm": 0.0003624087548814714, + "learning_rate": 0.003475025951341842, + "loss": 0.0142, + "num_input_tokens_seen": 33699424, + "step": 37255 + }, + { + "epoch": 9.83383925036294, + "grad_norm": 0.002128340769559145, + "learning_rate": 0.00346243147469249, + "loss": 0.021, + "num_input_tokens_seen": 33703936, + "step": 37260 + }, + { + "epoch": 9.835159033918437, + "grad_norm": 0.000563851383049041, + "learning_rate": 0.0034498595959777446, + "loss": 0.0061, + "num_input_tokens_seen": 33708512, + "step": 37265 + }, + { + "epoch": 9.836478817473933, + "grad_norm": 0.0001659397385083139, + "learning_rate": 0.003437310317136305, + "loss": 0.0072, + "num_input_tokens_seen": 33712800, + "step": 37270 + }, + { + "epoch": 9.837798601029432, + "grad_norm": 0.0024080900475382805, + "learning_rate": 0.0034247836401034236, + "loss": 0.0082, + "num_input_tokens_seen": 33717472, + "step": 37275 + }, + { + "epoch": 9.839118384584928, + "grad_norm": 0.0025375913828611374, + "learning_rate": 0.003412279566810905, + "loss": 0.0246, + "num_input_tokens_seen": 33721856, + "step": 37280 + }, + { + "epoch": 9.840438168140425, + "grad_norm": 0.0003869299835059792, + "learning_rate": 0.00339979809918699, + "loss": 0.0038, + "num_input_tokens_seen": 33726560, + "step": 37285 + }, + { + "epoch": 9.841757951695921, + "grad_norm": 0.0013514786260202527, + "learning_rate": 0.0033873392391565228, + "loss": 0.008, + "num_input_tokens_seen": 33730848, + "step": 37290 + }, + { + "epoch": 9.843077735251418, + "grad_norm": 0.0006167526007629931, + "learning_rate": 0.003374902988640782, + "loss": 0.0081, + "num_input_tokens_seen": 33735360, + "step": 37295 + }, + { + "epoch": 9.844397518806916, + "grad_norm": 0.0012779204407706857, + "learning_rate": 0.0033624893495576014, + "loss": 0.0429, + "num_input_tokens_seen": 33739616, + "step": 37300 + }, + { + "epoch": 9.845717302362413, + "grad_norm": 0.0005207600188441575, + "learning_rate": 0.0033500983238213323, + "loss": 0.0067, + "num_input_tokens_seen": 33744128, + "step": 37305 + }, + { + "epoch": 9.84703708591791, + "grad_norm": 0.0008362211519852281, + "learning_rate": 0.0033377299133428126, + "loss": 0.028, + "num_input_tokens_seen": 33748512, + "step": 37310 + }, + { + "epoch": 9.848356869473406, + "grad_norm": 0.0013107579434290528, + "learning_rate": 0.003325384120029434, + "loss": 0.0082, + "num_input_tokens_seen": 33753184, + "step": 37315 + }, + { + "epoch": 9.849676653028903, + "grad_norm": 0.002083793981000781, + "learning_rate": 0.0033130609457850233, + "loss": 0.0203, + "num_input_tokens_seen": 33757984, + "step": 37320 + }, + { + "epoch": 9.850996436584401, + "grad_norm": 0.0005060575203970075, + "learning_rate": 0.0033007603925100104, + "loss": 0.0151, + "num_input_tokens_seen": 33762496, + "step": 37325 + }, + { + "epoch": 9.852316220139897, + "grad_norm": 0.0031043309718370438, + "learning_rate": 0.003288482462101294, + "loss": 0.0153, + "num_input_tokens_seen": 33766848, + "step": 37330 + }, + { + "epoch": 9.853636003695394, + "grad_norm": 0.001408221898600459, + "learning_rate": 0.0032762271564522605, + "loss": 0.0202, + "num_input_tokens_seen": 33771424, + "step": 37335 + }, + { + "epoch": 9.85495578725089, + "grad_norm": 0.003502791980281472, + "learning_rate": 0.003263994477452864, + "loss": 0.0135, + "num_input_tokens_seen": 33775712, + "step": 37340 + }, + { + "epoch": 9.856275570806387, + "grad_norm": 0.0013964672107249498, + "learning_rate": 0.0032517844269895125, + "loss": 0.0311, + "num_input_tokens_seen": 33780160, + "step": 37345 + }, + { + "epoch": 9.857595354361884, + "grad_norm": 0.0029599678236991167, + "learning_rate": 0.0032395970069451496, + "loss": 0.0211, + "num_input_tokens_seen": 33784992, + "step": 37350 + }, + { + "epoch": 9.858915137917382, + "grad_norm": 0.002447716426104307, + "learning_rate": 0.0032274322191992388, + "loss": 0.0119, + "num_input_tokens_seen": 33789536, + "step": 37355 + }, + { + "epoch": 9.860234921472879, + "grad_norm": 0.0014636594569310546, + "learning_rate": 0.0032152900656277294, + "loss": 0.0371, + "num_input_tokens_seen": 33794208, + "step": 37360 + }, + { + "epoch": 9.861554705028375, + "grad_norm": 0.0006618605693802238, + "learning_rate": 0.0032031705481030902, + "loss": 0.0135, + "num_input_tokens_seen": 33798656, + "step": 37365 + }, + { + "epoch": 9.862874488583872, + "grad_norm": 0.0017613742966204882, + "learning_rate": 0.0031910736684943428, + "loss": 0.0375, + "num_input_tokens_seen": 33803360, + "step": 37370 + }, + { + "epoch": 9.864194272139368, + "grad_norm": 0.0007350879604928195, + "learning_rate": 0.0031789994286669453, + "loss": 0.0066, + "num_input_tokens_seen": 33807968, + "step": 37375 + }, + { + "epoch": 9.865514055694867, + "grad_norm": 0.001578822615556419, + "learning_rate": 0.003166947830482908, + "loss": 0.0112, + "num_input_tokens_seen": 33812832, + "step": 37380 + }, + { + "epoch": 9.866833839250363, + "grad_norm": 0.0014761866768822074, + "learning_rate": 0.003154918875800727, + "loss": 0.01, + "num_input_tokens_seen": 33817152, + "step": 37385 + }, + { + "epoch": 9.86815362280586, + "grad_norm": 0.004175487440079451, + "learning_rate": 0.00314291256647542, + "loss": 0.013, + "num_input_tokens_seen": 33821792, + "step": 37390 + }, + { + "epoch": 9.869473406361356, + "grad_norm": 0.00020780853810720146, + "learning_rate": 0.0031309289043585375, + "loss": 0.0037, + "num_input_tokens_seen": 33826272, + "step": 37395 + }, + { + "epoch": 9.870793189916853, + "grad_norm": 0.0016642662230879068, + "learning_rate": 0.003118967891298069, + "loss": 0.0098, + "num_input_tokens_seen": 33831008, + "step": 37400 + }, + { + "epoch": 9.870793189916853, + "eval_loss": 0.11140962690114975, + "eval_runtime": 75.8979, + "eval_samples_per_second": 88.738, + "eval_steps_per_second": 22.188, + "num_input_tokens_seen": 33831008, + "step": 37400 + }, + { + "epoch": 9.872112973472351, + "grad_norm": 0.0015783020062372088, + "learning_rate": 0.003107029529138572, + "loss": 0.0069, + "num_input_tokens_seen": 33835488, + "step": 37405 + }, + { + "epoch": 9.873432757027848, + "grad_norm": 0.004975730553269386, + "learning_rate": 0.0030951138197211235, + "loss": 0.0126, + "num_input_tokens_seen": 33839776, + "step": 37410 + }, + { + "epoch": 9.874752540583344, + "grad_norm": 0.005017183255404234, + "learning_rate": 0.0030832207648832377, + "loss": 0.0354, + "num_input_tokens_seen": 33844128, + "step": 37415 + }, + { + "epoch": 9.87607232413884, + "grad_norm": 9.908477659337223e-05, + "learning_rate": 0.0030713503664589635, + "loss": 0.0111, + "num_input_tokens_seen": 33848864, + "step": 37420 + }, + { + "epoch": 9.877392107694337, + "grad_norm": 0.0012259563663974404, + "learning_rate": 0.0030595026262788872, + "loss": 0.0079, + "num_input_tokens_seen": 33853056, + "step": 37425 + }, + { + "epoch": 9.878711891249836, + "grad_norm": 0.0014550890773534775, + "learning_rate": 0.00304767754617008, + "loss": 0.0167, + "num_input_tokens_seen": 33857824, + "step": 37430 + }, + { + "epoch": 9.880031674805332, + "grad_norm": 0.006118816323578358, + "learning_rate": 0.003035875127956117, + "loss": 0.0181, + "num_input_tokens_seen": 33862208, + "step": 37435 + }, + { + "epoch": 9.881351458360829, + "grad_norm": 0.0027647721581161022, + "learning_rate": 0.0030240953734570752, + "loss": 0.0377, + "num_input_tokens_seen": 33866656, + "step": 37440 + }, + { + "epoch": 9.882671241916325, + "grad_norm": 0.0008980795391835272, + "learning_rate": 0.003012338284489535, + "loss": 0.0158, + "num_input_tokens_seen": 33871392, + "step": 37445 + }, + { + "epoch": 9.883991025471822, + "grad_norm": 0.0030521859880536795, + "learning_rate": 0.0030006038628665964, + "loss": 0.0098, + "num_input_tokens_seen": 33875616, + "step": 37450 + }, + { + "epoch": 9.88531080902732, + "grad_norm": 0.0008536716923117638, + "learning_rate": 0.002988892110397845, + "loss": 0.021, + "num_input_tokens_seen": 33880608, + "step": 37455 + }, + { + "epoch": 9.886630592582817, + "grad_norm": 0.0009092077380046248, + "learning_rate": 0.0029772030288894025, + "loss": 0.0139, + "num_input_tokens_seen": 33885216, + "step": 37460 + }, + { + "epoch": 9.887950376138313, + "grad_norm": 0.0004046228132210672, + "learning_rate": 0.0029655366201438438, + "loss": 0.025, + "num_input_tokens_seen": 33890112, + "step": 37465 + }, + { + "epoch": 9.88927015969381, + "grad_norm": 0.0007801573956385255, + "learning_rate": 0.0029538928859602965, + "loss": 0.0206, + "num_input_tokens_seen": 33895072, + "step": 37470 + }, + { + "epoch": 9.890589943249307, + "grad_norm": 0.004454105626791716, + "learning_rate": 0.002942271828134374, + "loss": 0.0222, + "num_input_tokens_seen": 33899584, + "step": 37475 + }, + { + "epoch": 9.891909726804805, + "grad_norm": 9.16842109290883e-05, + "learning_rate": 0.00293067344845816, + "loss": 0.0098, + "num_input_tokens_seen": 33904416, + "step": 37480 + }, + { + "epoch": 9.893229510360301, + "grad_norm": 0.00022970532882027328, + "learning_rate": 0.0029190977487202896, + "loss": 0.0163, + "num_input_tokens_seen": 33909024, + "step": 37485 + }, + { + "epoch": 9.894549293915798, + "grad_norm": 0.0017006361158564687, + "learning_rate": 0.0029075447307058853, + "loss": 0.0093, + "num_input_tokens_seen": 33913280, + "step": 37490 + }, + { + "epoch": 9.895869077471295, + "grad_norm": 0.001230491092428565, + "learning_rate": 0.0028960143961965722, + "loss": 0.006, + "num_input_tokens_seen": 33917888, + "step": 37495 + }, + { + "epoch": 9.897188861026791, + "grad_norm": 0.002348696580156684, + "learning_rate": 0.002884506746970461, + "loss": 0.0261, + "num_input_tokens_seen": 33922144, + "step": 37500 + }, + { + "epoch": 9.89850864458229, + "grad_norm": 0.0005300035700201988, + "learning_rate": 0.0028730217848021654, + "loss": 0.0157, + "num_input_tokens_seen": 33926720, + "step": 37505 + }, + { + "epoch": 9.899828428137786, + "grad_norm": 0.005812219809740782, + "learning_rate": 0.0028615595114628188, + "loss": 0.0419, + "num_input_tokens_seen": 33931296, + "step": 37510 + }, + { + "epoch": 9.901148211693283, + "grad_norm": 0.00020319577015470713, + "learning_rate": 0.002850119928720074, + "loss": 0.0059, + "num_input_tokens_seen": 33935712, + "step": 37515 + }, + { + "epoch": 9.902467995248779, + "grad_norm": 0.002418347867205739, + "learning_rate": 0.0028387030383380195, + "loss": 0.0175, + "num_input_tokens_seen": 33940256, + "step": 37520 + }, + { + "epoch": 9.903787778804276, + "grad_norm": 0.0014526972081512213, + "learning_rate": 0.0028273088420772974, + "loss": 0.0235, + "num_input_tokens_seen": 33944800, + "step": 37525 + }, + { + "epoch": 9.905107562359772, + "grad_norm": 0.005609553307294846, + "learning_rate": 0.002815937341695068, + "loss": 0.0278, + "num_input_tokens_seen": 33949120, + "step": 37530 + }, + { + "epoch": 9.90642734591527, + "grad_norm": 0.000910173577722162, + "learning_rate": 0.0028045885389448963, + "loss": 0.0067, + "num_input_tokens_seen": 33953472, + "step": 37535 + }, + { + "epoch": 9.907747129470767, + "grad_norm": 0.0045670135878026485, + "learning_rate": 0.002793262435576965, + "loss": 0.0255, + "num_input_tokens_seen": 33958048, + "step": 37540 + }, + { + "epoch": 9.909066913026264, + "grad_norm": 0.0006611170247197151, + "learning_rate": 0.0027819590333378772, + "loss": 0.006, + "num_input_tokens_seen": 33962976, + "step": 37545 + }, + { + "epoch": 9.91038669658176, + "grad_norm": 0.0004554427578113973, + "learning_rate": 0.002770678333970755, + "loss": 0.0093, + "num_input_tokens_seen": 33967552, + "step": 37550 + }, + { + "epoch": 9.911706480137257, + "grad_norm": 0.0007082667434588075, + "learning_rate": 0.0027594203392152573, + "loss": 0.0207, + "num_input_tokens_seen": 33972320, + "step": 37555 + }, + { + "epoch": 9.913026263692755, + "grad_norm": 0.0003982196212746203, + "learning_rate": 0.002748185050807478, + "loss": 0.0069, + "num_input_tokens_seen": 33977376, + "step": 37560 + }, + { + "epoch": 9.914346047248252, + "grad_norm": 0.0008295847801491618, + "learning_rate": 0.002736972470480031, + "loss": 0.016, + "num_input_tokens_seen": 33981600, + "step": 37565 + }, + { + "epoch": 9.915665830803748, + "grad_norm": 0.0009767517913132906, + "learning_rate": 0.002725782599962068, + "loss": 0.0234, + "num_input_tokens_seen": 33986080, + "step": 37570 + }, + { + "epoch": 9.916985614359245, + "grad_norm": 0.0018441150896251202, + "learning_rate": 0.0027146154409791734, + "loss": 0.0121, + "num_input_tokens_seen": 33990304, + "step": 37575 + }, + { + "epoch": 9.918305397914741, + "grad_norm": 0.0014523951103910804, + "learning_rate": 0.002703470995253504, + "loss": 0.0083, + "num_input_tokens_seen": 33995136, + "step": 37580 + }, + { + "epoch": 9.91962518147024, + "grad_norm": 0.004353400319814682, + "learning_rate": 0.0026923492645036184, + "loss": 0.0336, + "num_input_tokens_seen": 33999552, + "step": 37585 + }, + { + "epoch": 9.920944965025736, + "grad_norm": 0.000930081179831177, + "learning_rate": 0.0026812502504446776, + "loss": 0.0139, + "num_input_tokens_seen": 34003904, + "step": 37590 + }, + { + "epoch": 9.922264748581233, + "grad_norm": 0.0021589098032563925, + "learning_rate": 0.0026701739547882798, + "loss": 0.025, + "num_input_tokens_seen": 34008416, + "step": 37595 + }, + { + "epoch": 9.92358453213673, + "grad_norm": 0.001243119826540351, + "learning_rate": 0.0026591203792425077, + "loss": 0.0086, + "num_input_tokens_seen": 34012992, + "step": 37600 + }, + { + "epoch": 9.92358453213673, + "eval_loss": 0.11133884638547897, + "eval_runtime": 75.991, + "eval_samples_per_second": 88.629, + "eval_steps_per_second": 22.161, + "num_input_tokens_seen": 34012992, + "step": 37600 + }, + { + "epoch": 9.924904315692226, + "grad_norm": 0.00018379921675659716, + "learning_rate": 0.0026480895255119818, + "loss": 0.0233, + "num_input_tokens_seen": 34017440, + "step": 37605 + }, + { + "epoch": 9.926224099247724, + "grad_norm": 0.00022167645511217415, + "learning_rate": 0.002637081395297791, + "loss": 0.0208, + "num_input_tokens_seen": 34021888, + "step": 37610 + }, + { + "epoch": 9.92754388280322, + "grad_norm": 0.0002686524821911007, + "learning_rate": 0.0026260959902975113, + "loss": 0.0051, + "num_input_tokens_seen": 34026560, + "step": 37615 + }, + { + "epoch": 9.928863666358717, + "grad_norm": 0.003959538880735636, + "learning_rate": 0.00261513331220527, + "loss": 0.0171, + "num_input_tokens_seen": 34030880, + "step": 37620 + }, + { + "epoch": 9.930183449914214, + "grad_norm": 0.0019470079569146037, + "learning_rate": 0.0026041933627116154, + "loss": 0.0193, + "num_input_tokens_seen": 34035264, + "step": 37625 + }, + { + "epoch": 9.93150323346971, + "grad_norm": 0.0011900434037670493, + "learning_rate": 0.0025932761435036476, + "loss": 0.0112, + "num_input_tokens_seen": 34039744, + "step": 37630 + }, + { + "epoch": 9.932823017025207, + "grad_norm": 0.0002642254112288356, + "learning_rate": 0.002582381656264904, + "loss": 0.0147, + "num_input_tokens_seen": 34044352, + "step": 37635 + }, + { + "epoch": 9.934142800580705, + "grad_norm": 0.001339005888439715, + "learning_rate": 0.0025715099026754895, + "loss": 0.0153, + "num_input_tokens_seen": 34049024, + "step": 37640 + }, + { + "epoch": 9.935462584136202, + "grad_norm": 3.6888308386551216e-05, + "learning_rate": 0.002560660884411947, + "loss": 0.0085, + "num_input_tokens_seen": 34053472, + "step": 37645 + }, + { + "epoch": 9.936782367691698, + "grad_norm": 0.0015431715874001384, + "learning_rate": 0.0025498346031473385, + "loss": 0.0277, + "num_input_tokens_seen": 34058016, + "step": 37650 + }, + { + "epoch": 9.938102151247195, + "grad_norm": 0.0018426019232720137, + "learning_rate": 0.0025390310605511945, + "loss": 0.0237, + "num_input_tokens_seen": 34062592, + "step": 37655 + }, + { + "epoch": 9.939421934802692, + "grad_norm": 6.090176248108037e-05, + "learning_rate": 0.0025282502582895995, + "loss": 0.0058, + "num_input_tokens_seen": 34067168, + "step": 37660 + }, + { + "epoch": 9.94074171835819, + "grad_norm": 0.0016618171939626336, + "learning_rate": 0.002517492198025023, + "loss": 0.0285, + "num_input_tokens_seen": 34071680, + "step": 37665 + }, + { + "epoch": 9.942061501913686, + "grad_norm": 0.0006538035813719034, + "learning_rate": 0.0025067568814165554, + "loss": 0.0191, + "num_input_tokens_seen": 34076192, + "step": 37670 + }, + { + "epoch": 9.943381285469183, + "grad_norm": 0.0010574444895610213, + "learning_rate": 0.0024960443101196884, + "loss": 0.0083, + "num_input_tokens_seen": 34080576, + "step": 37675 + }, + { + "epoch": 9.94470106902468, + "grad_norm": 0.0039375899359583855, + "learning_rate": 0.002485354485786434, + "loss": 0.0185, + "num_input_tokens_seen": 34085408, + "step": 37680 + }, + { + "epoch": 9.946020852580176, + "grad_norm": 0.0008004137198440731, + "learning_rate": 0.002474687410065307, + "loss": 0.0043, + "num_input_tokens_seen": 34090272, + "step": 37685 + }, + { + "epoch": 9.947340636135674, + "grad_norm": 0.00010020003537647426, + "learning_rate": 0.002464043084601308, + "loss": 0.0158, + "num_input_tokens_seen": 34094624, + "step": 37690 + }, + { + "epoch": 9.948660419691171, + "grad_norm": 0.0002246724907308817, + "learning_rate": 0.0024534215110358915, + "loss": 0.005, + "num_input_tokens_seen": 34098912, + "step": 37695 + }, + { + "epoch": 9.949980203246668, + "grad_norm": 0.001529707107692957, + "learning_rate": 0.002442822691007096, + "loss": 0.0408, + "num_input_tokens_seen": 34103584, + "step": 37700 + }, + { + "epoch": 9.951299986802164, + "grad_norm": 0.001464875997044146, + "learning_rate": 0.002432246626149348, + "loss": 0.0086, + "num_input_tokens_seen": 34108000, + "step": 37705 + }, + { + "epoch": 9.95261977035766, + "grad_norm": 0.0005152446101419628, + "learning_rate": 0.002421693318093626, + "loss": 0.0051, + "num_input_tokens_seen": 34112736, + "step": 37710 + }, + { + "epoch": 9.953939553913159, + "grad_norm": 0.0013435714645311236, + "learning_rate": 0.0024111627684673784, + "loss": 0.0333, + "num_input_tokens_seen": 34117248, + "step": 37715 + }, + { + "epoch": 9.955259337468656, + "grad_norm": 0.0026105542201548815, + "learning_rate": 0.0024006549788945395, + "loss": 0.0273, + "num_input_tokens_seen": 34121696, + "step": 37720 + }, + { + "epoch": 9.956579121024152, + "grad_norm": 0.004845714662224054, + "learning_rate": 0.0023901699509955463, + "loss": 0.0341, + "num_input_tokens_seen": 34126368, + "step": 37725 + }, + { + "epoch": 9.957898904579649, + "grad_norm": 0.0043983496725559235, + "learning_rate": 0.0023797076863873554, + "loss": 0.0141, + "num_input_tokens_seen": 34130976, + "step": 37730 + }, + { + "epoch": 9.959218688135145, + "grad_norm": 0.0034097894094884396, + "learning_rate": 0.0023692681866833262, + "loss": 0.0186, + "num_input_tokens_seen": 34135488, + "step": 37735 + }, + { + "epoch": 9.960538471690644, + "grad_norm": 0.001684707822278142, + "learning_rate": 0.0023588514534934046, + "loss": 0.0112, + "num_input_tokens_seen": 34140576, + "step": 37740 + }, + { + "epoch": 9.96185825524614, + "grad_norm": 0.00026015922776423395, + "learning_rate": 0.002348457488423955, + "loss": 0.017, + "num_input_tokens_seen": 34145120, + "step": 37745 + }, + { + "epoch": 9.963178038801637, + "grad_norm": 0.00480707548558712, + "learning_rate": 0.0023380862930778624, + "loss": 0.0229, + "num_input_tokens_seen": 34149856, + "step": 37750 + }, + { + "epoch": 9.964497822357133, + "grad_norm": 0.001141062588430941, + "learning_rate": 0.0023277378690545135, + "loss": 0.0152, + "num_input_tokens_seen": 34154464, + "step": 37755 + }, + { + "epoch": 9.96581760591263, + "grad_norm": 0.000711045868229121, + "learning_rate": 0.0023174122179497325, + "loss": 0.0051, + "num_input_tokens_seen": 34158496, + "step": 37760 + }, + { + "epoch": 9.967137389468128, + "grad_norm": 0.002383847488090396, + "learning_rate": 0.0023071093413558784, + "loss": 0.017, + "num_input_tokens_seen": 34163168, + "step": 37765 + }, + { + "epoch": 9.968457173023625, + "grad_norm": 0.003365172306075692, + "learning_rate": 0.002296829240861814, + "loss": 0.0109, + "num_input_tokens_seen": 34167904, + "step": 37770 + }, + { + "epoch": 9.969776956579121, + "grad_norm": 0.0007333211251534522, + "learning_rate": 0.002286571918052821, + "loss": 0.0152, + "num_input_tokens_seen": 34172640, + "step": 37775 + }, + { + "epoch": 9.971096740134618, + "grad_norm": 0.0005759427440352738, + "learning_rate": 0.0022763373745107174, + "loss": 0.0053, + "num_input_tokens_seen": 34176960, + "step": 37780 + }, + { + "epoch": 9.972416523690114, + "grad_norm": 0.0006676752236671746, + "learning_rate": 0.0022661256118138074, + "loss": 0.009, + "num_input_tokens_seen": 34181600, + "step": 37785 + }, + { + "epoch": 9.97373630724561, + "grad_norm": 0.0018912235973402858, + "learning_rate": 0.0022559366315368645, + "loss": 0.0082, + "num_input_tokens_seen": 34186176, + "step": 37790 + }, + { + "epoch": 9.97505609080111, + "grad_norm": 0.00032336104777641594, + "learning_rate": 0.002245770435251182, + "loss": 0.0331, + "num_input_tokens_seen": 34190720, + "step": 37795 + }, + { + "epoch": 9.976375874356606, + "grad_norm": 0.0023296368308365345, + "learning_rate": 0.002235627024524456, + "loss": 0.0102, + "num_input_tokens_seen": 34195168, + "step": 37800 + }, + { + "epoch": 9.976375874356606, + "eval_loss": 0.11140639334917068, + "eval_runtime": 75.7844, + "eval_samples_per_second": 88.871, + "eval_steps_per_second": 22.221, + "num_input_tokens_seen": 34195168, + "step": 37800 + }, + { + "epoch": 9.977695657912102, + "grad_norm": 0.002238219603896141, + "learning_rate": 0.0022255064009209847, + "loss": 0.0111, + "num_input_tokens_seen": 34199360, + "step": 37805 + }, + { + "epoch": 9.979015441467599, + "grad_norm": 0.0031470567919313908, + "learning_rate": 0.0022154085660014864, + "loss": 0.0456, + "num_input_tokens_seen": 34203904, + "step": 37810 + }, + { + "epoch": 9.980335225023095, + "grad_norm": 0.0024137881118804216, + "learning_rate": 0.0022053335213231494, + "loss": 0.0207, + "num_input_tokens_seen": 34208384, + "step": 37815 + }, + { + "epoch": 9.981655008578594, + "grad_norm": 0.00011647977953543887, + "learning_rate": 0.002195281268439697, + "loss": 0.0044, + "num_input_tokens_seen": 34212992, + "step": 37820 + }, + { + "epoch": 9.98297479213409, + "grad_norm": 0.00046774459769949317, + "learning_rate": 0.002185251808901306, + "loss": 0.0192, + "num_input_tokens_seen": 34217536, + "step": 37825 + }, + { + "epoch": 9.984294575689587, + "grad_norm": 0.0014185024192556739, + "learning_rate": 0.0021752451442546227, + "loss": 0.0312, + "num_input_tokens_seen": 34221856, + "step": 37830 + }, + { + "epoch": 9.985614359245083, + "grad_norm": 0.0002543860173318535, + "learning_rate": 0.0021652612760428456, + "loss": 0.0226, + "num_input_tokens_seen": 34226368, + "step": 37835 + }, + { + "epoch": 9.98693414280058, + "grad_norm": 0.0011948442552238703, + "learning_rate": 0.0021553002058055603, + "loss": 0.0283, + "num_input_tokens_seen": 34230560, + "step": 37840 + }, + { + "epoch": 9.988253926356078, + "grad_norm": 0.001565119018778205, + "learning_rate": 0.0021453619350789376, + "loss": 0.0129, + "num_input_tokens_seen": 34235008, + "step": 37845 + }, + { + "epoch": 9.989573709911575, + "grad_norm": 0.002411880064755678, + "learning_rate": 0.0021354464653955516, + "loss": 0.0219, + "num_input_tokens_seen": 34239648, + "step": 37850 + }, + { + "epoch": 9.990893493467071, + "grad_norm": 0.00029509200248867273, + "learning_rate": 0.002125553798284513, + "loss": 0.0016, + "num_input_tokens_seen": 34243936, + "step": 37855 + }, + { + "epoch": 9.992213277022568, + "grad_norm": 0.0008330559940077364, + "learning_rate": 0.002115683935271384, + "loss": 0.0212, + "num_input_tokens_seen": 34248480, + "step": 37860 + }, + { + "epoch": 9.993533060578065, + "grad_norm": 0.00030135500128380954, + "learning_rate": 0.0021058368778782144, + "loss": 0.0164, + "num_input_tokens_seen": 34252832, + "step": 37865 + }, + { + "epoch": 9.994852844133563, + "grad_norm": 0.002782933646813035, + "learning_rate": 0.002096012627623539, + "loss": 0.0226, + "num_input_tokens_seen": 34257184, + "step": 37870 + }, + { + "epoch": 9.99617262768906, + "grad_norm": 0.0005224854685366154, + "learning_rate": 0.00208621118602243, + "loss": 0.0161, + "num_input_tokens_seen": 34261856, + "step": 37875 + }, + { + "epoch": 9.997492411244556, + "grad_norm": 0.00042826629942283034, + "learning_rate": 0.002076432554586327, + "loss": 0.0086, + "num_input_tokens_seen": 34266400, + "step": 37880 + }, + { + "epoch": 9.998812194800053, + "grad_norm": 0.0006937733269296587, + "learning_rate": 0.002066676734823258, + "loss": 0.0215, + "num_input_tokens_seen": 34271008, + "step": 37885 + }, + { + "epoch": 10.0, + "grad_norm": 0.004740640055388212, + "learning_rate": 0.0020569437282376866, + "loss": 0.0157, + "num_input_tokens_seen": 34274944, + "step": 37890 + }, + { + "epoch": 10.001319783555497, + "grad_norm": 0.001037153066135943, + "learning_rate": 0.002047233536330545, + "loss": 0.0146, + "num_input_tokens_seen": 34279072, + "step": 37895 + }, + { + "epoch": 10.002639567110993, + "grad_norm": 0.00018641879432834685, + "learning_rate": 0.0020375461605993015, + "loss": 0.0062, + "num_input_tokens_seen": 34283840, + "step": 37900 + }, + { + "epoch": 10.003959350666491, + "grad_norm": 0.00038703266181983054, + "learning_rate": 0.002027881602537845, + "loss": 0.0073, + "num_input_tokens_seen": 34288288, + "step": 37905 + }, + { + "epoch": 10.005279134221988, + "grad_norm": 0.00053436029702425, + "learning_rate": 0.002018239863636567, + "loss": 0.0109, + "num_input_tokens_seen": 34292672, + "step": 37910 + }, + { + "epoch": 10.006598917777485, + "grad_norm": 0.0018630630802363157, + "learning_rate": 0.002008620945382378, + "loss": 0.0066, + "num_input_tokens_seen": 34297216, + "step": 37915 + }, + { + "epoch": 10.007918701332981, + "grad_norm": 0.00013438124733511358, + "learning_rate": 0.001999024849258607, + "loss": 0.0164, + "num_input_tokens_seen": 34301792, + "step": 37920 + }, + { + "epoch": 10.009238484888478, + "grad_norm": 0.0014151346404105425, + "learning_rate": 0.001989451576745105, + "loss": 0.0119, + "num_input_tokens_seen": 34306432, + "step": 37925 + }, + { + "epoch": 10.010558268443976, + "grad_norm": 0.0025365978945046663, + "learning_rate": 0.00197990112931819, + "loss": 0.0155, + "num_input_tokens_seen": 34310656, + "step": 37930 + }, + { + "epoch": 10.011878051999473, + "grad_norm": 0.005136224441230297, + "learning_rate": 0.0019703735084506345, + "loss": 0.0248, + "num_input_tokens_seen": 34315200, + "step": 37935 + }, + { + "epoch": 10.01319783555497, + "grad_norm": 0.0002896749065257609, + "learning_rate": 0.001960868715611763, + "loss": 0.007, + "num_input_tokens_seen": 34319488, + "step": 37940 + }, + { + "epoch": 10.014517619110466, + "grad_norm": 0.0009299790835939348, + "learning_rate": 0.0019513867522673034, + "loss": 0.0035, + "num_input_tokens_seen": 34324064, + "step": 37945 + }, + { + "epoch": 10.015837402665962, + "grad_norm": 0.002545515540987253, + "learning_rate": 0.001941927619879502, + "loss": 0.0103, + "num_input_tokens_seen": 34328256, + "step": 37950 + }, + { + "epoch": 10.01715718622146, + "grad_norm": 0.0006196534377522767, + "learning_rate": 0.0019324913199070758, + "loss": 0.0145, + "num_input_tokens_seen": 34332992, + "step": 37955 + }, + { + "epoch": 10.018476969776957, + "grad_norm": 9.749265154823661e-05, + "learning_rate": 0.0019230778538052106, + "loss": 0.004, + "num_input_tokens_seen": 34337568, + "step": 37960 + }, + { + "epoch": 10.019796753332454, + "grad_norm": 0.0004870948614552617, + "learning_rate": 0.0019136872230255952, + "loss": 0.0258, + "num_input_tokens_seen": 34342112, + "step": 37965 + }, + { + "epoch": 10.02111653688795, + "grad_norm": 0.00044911622535437346, + "learning_rate": 0.0019043194290164045, + "loss": 0.0123, + "num_input_tokens_seen": 34346592, + "step": 37970 + }, + { + "epoch": 10.022436320443447, + "grad_norm": 0.0008818774367682636, + "learning_rate": 0.0018949744732222162, + "loss": 0.0097, + "num_input_tokens_seen": 34350976, + "step": 37975 + }, + { + "epoch": 10.023756103998943, + "grad_norm": 6.436784315155819e-05, + "learning_rate": 0.0018856523570841776, + "loss": 0.0032, + "num_input_tokens_seen": 34355520, + "step": 37980 + }, + { + "epoch": 10.025075887554442, + "grad_norm": 0.0038235618267208338, + "learning_rate": 0.0018763530820398555, + "loss": 0.0156, + "num_input_tokens_seen": 34360224, + "step": 37985 + }, + { + "epoch": 10.026395671109938, + "grad_norm": 0.0006447634659707546, + "learning_rate": 0.0018670766495233525, + "loss": 0.0298, + "num_input_tokens_seen": 34364544, + "step": 37990 + }, + { + "epoch": 10.027715454665435, + "grad_norm": 0.0033513461239635944, + "learning_rate": 0.001857823060965158, + "loss": 0.0158, + "num_input_tokens_seen": 34369280, + "step": 37995 + }, + { + "epoch": 10.029035238220931, + "grad_norm": 0.0017805895768105984, + "learning_rate": 0.0018485923177923467, + "loss": 0.0149, + "num_input_tokens_seen": 34373792, + "step": 38000 + }, + { + "epoch": 10.029035238220931, + "eval_loss": 0.11173610389232635, + "eval_runtime": 75.9614, + "eval_samples_per_second": 88.663, + "eval_steps_per_second": 22.169, + "num_input_tokens_seen": 34373792, + "step": 38000 + }, + { + "epoch": 10.030355021776428, + "grad_norm": 0.0012904390459880233, + "learning_rate": 0.001839384421428364, + "loss": 0.0153, + "num_input_tokens_seen": 34378240, + "step": 38005 + }, + { + "epoch": 10.031674805331926, + "grad_norm": 0.001203139079734683, + "learning_rate": 0.0018301993732932065, + "loss": 0.0039, + "num_input_tokens_seen": 34382752, + "step": 38010 + }, + { + "epoch": 10.032994588887423, + "grad_norm": 0.003520858008414507, + "learning_rate": 0.0018210371748033248, + "loss": 0.0096, + "num_input_tokens_seen": 34387168, + "step": 38015 + }, + { + "epoch": 10.03431437244292, + "grad_norm": 0.0015365451108664274, + "learning_rate": 0.0018118978273716556, + "loss": 0.027, + "num_input_tokens_seen": 34391712, + "step": 38020 + }, + { + "epoch": 10.035634155998416, + "grad_norm": 0.0008308921824209392, + "learning_rate": 0.001802781332407588, + "loss": 0.0082, + "num_input_tokens_seen": 34396192, + "step": 38025 + }, + { + "epoch": 10.036953939553912, + "grad_norm": 0.0003279796801507473, + "learning_rate": 0.0017936876913169806, + "loss": 0.0481, + "num_input_tokens_seen": 34400544, + "step": 38030 + }, + { + "epoch": 10.03827372310941, + "grad_norm": 0.0007010165718384087, + "learning_rate": 0.0017846169055022287, + "loss": 0.0058, + "num_input_tokens_seen": 34405088, + "step": 38035 + }, + { + "epoch": 10.039593506664907, + "grad_norm": 0.00023146066814661026, + "learning_rate": 0.0017755689763621295, + "loss": 0.009, + "num_input_tokens_seen": 34409952, + "step": 38040 + }, + { + "epoch": 10.040913290220404, + "grad_norm": 0.0020549946930259466, + "learning_rate": 0.0017665439052920173, + "loss": 0.0148, + "num_input_tokens_seen": 34414496, + "step": 38045 + }, + { + "epoch": 10.0422330737759, + "grad_norm": 0.00046475575072690845, + "learning_rate": 0.0017575416936836286, + "loss": 0.0131, + "num_input_tokens_seen": 34419296, + "step": 38050 + }, + { + "epoch": 10.043552857331397, + "grad_norm": 0.003257479751482606, + "learning_rate": 0.0017485623429252528, + "loss": 0.0098, + "num_input_tokens_seen": 34423904, + "step": 38055 + }, + { + "epoch": 10.044872640886895, + "grad_norm": 0.0015607794048264623, + "learning_rate": 0.0017396058544016156, + "loss": 0.0229, + "num_input_tokens_seen": 34428480, + "step": 38060 + }, + { + "epoch": 10.046192424442392, + "grad_norm": 0.0038280189037323, + "learning_rate": 0.0017306722294938958, + "loss": 0.0176, + "num_input_tokens_seen": 34432928, + "step": 38065 + }, + { + "epoch": 10.047512207997888, + "grad_norm": 0.002194241853430867, + "learning_rate": 0.0017217614695798078, + "loss": 0.0286, + "num_input_tokens_seen": 34437152, + "step": 38070 + }, + { + "epoch": 10.048831991553385, + "grad_norm": 0.00013228550960775465, + "learning_rate": 0.001712873576033469, + "loss": 0.0073, + "num_input_tokens_seen": 34441600, + "step": 38075 + }, + { + "epoch": 10.050151775108882, + "grad_norm": 0.002891052048653364, + "learning_rate": 0.0017040085502255163, + "loss": 0.0117, + "num_input_tokens_seen": 34446112, + "step": 38080 + }, + { + "epoch": 10.05147155866438, + "grad_norm": 0.0006249333964660764, + "learning_rate": 0.0016951663935230565, + "loss": 0.0187, + "num_input_tokens_seen": 34450560, + "step": 38085 + }, + { + "epoch": 10.052791342219876, + "grad_norm": 0.002403359394520521, + "learning_rate": 0.0016863471072896485, + "loss": 0.0179, + "num_input_tokens_seen": 34455392, + "step": 38090 + }, + { + "epoch": 10.054111125775373, + "grad_norm": 0.00033935162355192006, + "learning_rate": 0.0016775506928853377, + "loss": 0.0077, + "num_input_tokens_seen": 34460032, + "step": 38095 + }, + { + "epoch": 10.05543090933087, + "grad_norm": 0.0010943595552816987, + "learning_rate": 0.001668777151666656, + "loss": 0.0072, + "num_input_tokens_seen": 34464480, + "step": 38100 + }, + { + "epoch": 10.056750692886366, + "grad_norm": 0.002624622778967023, + "learning_rate": 0.0016600264849865709, + "loss": 0.0089, + "num_input_tokens_seen": 34468736, + "step": 38105 + }, + { + "epoch": 10.058070476441863, + "grad_norm": 0.001885056495666504, + "learning_rate": 0.0016512986941945695, + "loss": 0.0204, + "num_input_tokens_seen": 34473152, + "step": 38110 + }, + { + "epoch": 10.059390259997361, + "grad_norm": 0.0002972774382214993, + "learning_rate": 0.0016425937806365753, + "loss": 0.0248, + "num_input_tokens_seen": 34477600, + "step": 38115 + }, + { + "epoch": 10.060710043552858, + "grad_norm": 0.00048179723671637475, + "learning_rate": 0.0016339117456549979, + "loss": 0.0032, + "num_input_tokens_seen": 34482048, + "step": 38120 + }, + { + "epoch": 10.062029827108354, + "grad_norm": 0.0008077442762441933, + "learning_rate": 0.0016252525905886995, + "loss": 0.0128, + "num_input_tokens_seen": 34486624, + "step": 38125 + }, + { + "epoch": 10.06334961066385, + "grad_norm": 0.0007697427063249052, + "learning_rate": 0.0016166163167730617, + "loss": 0.0083, + "num_input_tokens_seen": 34491296, + "step": 38130 + }, + { + "epoch": 10.064669394219347, + "grad_norm": 0.0006645459216088057, + "learning_rate": 0.0016080029255398864, + "loss": 0.0213, + "num_input_tokens_seen": 34495520, + "step": 38135 + }, + { + "epoch": 10.065989177774846, + "grad_norm": 0.0010627517476677895, + "learning_rate": 0.0015994124182174606, + "loss": 0.0052, + "num_input_tokens_seen": 34500192, + "step": 38140 + }, + { + "epoch": 10.067308961330342, + "grad_norm": 0.0001502338272985071, + "learning_rate": 0.001590844796130575, + "loss": 0.0277, + "num_input_tokens_seen": 34504800, + "step": 38145 + }, + { + "epoch": 10.068628744885839, + "grad_norm": 0.00035105712595395744, + "learning_rate": 0.001582300060600439, + "loss": 0.014, + "num_input_tokens_seen": 34509664, + "step": 38150 + }, + { + "epoch": 10.069948528441335, + "grad_norm": 0.00022229809837881476, + "learning_rate": 0.0015737782129447652, + "loss": 0.0045, + "num_input_tokens_seen": 34514016, + "step": 38155 + }, + { + "epoch": 10.071268311996832, + "grad_norm": 0.0002972292131744325, + "learning_rate": 0.0015652792544777361, + "loss": 0.0037, + "num_input_tokens_seen": 34518368, + "step": 38160 + }, + { + "epoch": 10.07258809555233, + "grad_norm": 0.0033438364043831825, + "learning_rate": 0.0015568031865099863, + "loss": 0.0496, + "num_input_tokens_seen": 34522880, + "step": 38165 + }, + { + "epoch": 10.073907879107827, + "grad_norm": 0.0019815051928162575, + "learning_rate": 0.0015483500103486369, + "loss": 0.0139, + "num_input_tokens_seen": 34527488, + "step": 38170 + }, + { + "epoch": 10.075227662663323, + "grad_norm": 0.00037780796992592514, + "learning_rate": 0.0015399197272972787, + "loss": 0.0087, + "num_input_tokens_seen": 34532000, + "step": 38175 + }, + { + "epoch": 10.07654744621882, + "grad_norm": 0.0008366869296878576, + "learning_rate": 0.0015315123386559714, + "loss": 0.0046, + "num_input_tokens_seen": 34536352, + "step": 38180 + }, + { + "epoch": 10.077867229774316, + "grad_norm": 0.0012491467641666532, + "learning_rate": 0.0015231278457212283, + "loss": 0.0108, + "num_input_tokens_seen": 34540704, + "step": 38185 + }, + { + "epoch": 10.079187013329815, + "grad_norm": 2.0325829609646462e-05, + "learning_rate": 0.001514766249786048, + "loss": 0.0239, + "num_input_tokens_seen": 34545120, + "step": 38190 + }, + { + "epoch": 10.080506796885311, + "grad_norm": 0.0013615799834951758, + "learning_rate": 0.0015064275521398994, + "loss": 0.0097, + "num_input_tokens_seen": 34549472, + "step": 38195 + }, + { + "epoch": 10.081826580440808, + "grad_norm": 0.0012206225655972958, + "learning_rate": 0.0014981117540686872, + "loss": 0.014, + "num_input_tokens_seen": 34553856, + "step": 38200 + }, + { + "epoch": 10.081826580440808, + "eval_loss": 0.11127419024705887, + "eval_runtime": 75.8213, + "eval_samples_per_second": 88.827, + "eval_steps_per_second": 22.21, + "num_input_tokens_seen": 34553856, + "step": 38200 + }, + { + "epoch": 10.083146363996304, + "grad_norm": 0.00048130008508451283, + "learning_rate": 0.0014898188568548687, + "loss": 0.0025, + "num_input_tokens_seen": 34558272, + "step": 38205 + }, + { + "epoch": 10.084466147551801, + "grad_norm": 0.0016590588493272662, + "learning_rate": 0.0014815488617772542, + "loss": 0.0085, + "num_input_tokens_seen": 34563200, + "step": 38210 + }, + { + "epoch": 10.0857859311073, + "grad_norm": 0.0038372809067368507, + "learning_rate": 0.0014733017701112072, + "loss": 0.0416, + "num_input_tokens_seen": 34567488, + "step": 38215 + }, + { + "epoch": 10.087105714662796, + "grad_norm": 0.0005471418262459338, + "learning_rate": 0.0014650775831285435, + "loss": 0.015, + "num_input_tokens_seen": 34572320, + "step": 38220 + }, + { + "epoch": 10.088425498218292, + "grad_norm": 0.001224161242134869, + "learning_rate": 0.001456876302097515, + "loss": 0.0254, + "num_input_tokens_seen": 34576832, + "step": 38225 + }, + { + "epoch": 10.089745281773789, + "grad_norm": 0.004239581059664488, + "learning_rate": 0.0014486979282828604, + "loss": 0.0114, + "num_input_tokens_seen": 34581344, + "step": 38230 + }, + { + "epoch": 10.091065065329285, + "grad_norm": 0.0014528821920976043, + "learning_rate": 0.001440542462945804, + "loss": 0.0102, + "num_input_tokens_seen": 34586080, + "step": 38235 + }, + { + "epoch": 10.092384848884782, + "grad_norm": 0.0014084201538935304, + "learning_rate": 0.0014324099073440232, + "loss": 0.0076, + "num_input_tokens_seen": 34590656, + "step": 38240 + }, + { + "epoch": 10.09370463244028, + "grad_norm": 0.0019210968166589737, + "learning_rate": 0.0014243002627316482, + "loss": 0.0242, + "num_input_tokens_seen": 34595008, + "step": 38245 + }, + { + "epoch": 10.095024415995777, + "grad_norm": 0.0006590409902855754, + "learning_rate": 0.0014162135303592781, + "loss": 0.014, + "num_input_tokens_seen": 34599584, + "step": 38250 + }, + { + "epoch": 10.096344199551273, + "grad_norm": 0.0027527029160410166, + "learning_rate": 0.001408149711474016, + "loss": 0.0072, + "num_input_tokens_seen": 34604096, + "step": 38255 + }, + { + "epoch": 10.09766398310677, + "grad_norm": 0.0005517737008631229, + "learning_rate": 0.0014001088073193834, + "loss": 0.0091, + "num_input_tokens_seen": 34608800, + "step": 38260 + }, + { + "epoch": 10.098983766662267, + "grad_norm": 0.0001788503723219037, + "learning_rate": 0.0013920908191354052, + "loss": 0.0075, + "num_input_tokens_seen": 34613120, + "step": 38265 + }, + { + "epoch": 10.100303550217765, + "grad_norm": 0.00029936988721601665, + "learning_rate": 0.001384095748158526, + "loss": 0.0176, + "num_input_tokens_seen": 34617344, + "step": 38270 + }, + { + "epoch": 10.101623333773261, + "grad_norm": 0.00018820709374267608, + "learning_rate": 0.0013761235956217255, + "loss": 0.027, + "num_input_tokens_seen": 34621696, + "step": 38275 + }, + { + "epoch": 10.102943117328758, + "grad_norm": 0.003012278350070119, + "learning_rate": 0.0013681743627543873, + "loss": 0.0156, + "num_input_tokens_seen": 34626112, + "step": 38280 + }, + { + "epoch": 10.104262900884255, + "grad_norm": 0.00040638569043949246, + "learning_rate": 0.001360248050782381, + "loss": 0.0109, + "num_input_tokens_seen": 34630624, + "step": 38285 + }, + { + "epoch": 10.105582684439751, + "grad_norm": 0.002324122004210949, + "learning_rate": 0.001352344660928062, + "loss": 0.0073, + "num_input_tokens_seen": 34635328, + "step": 38290 + }, + { + "epoch": 10.10690246799525, + "grad_norm": 0.0014431591844186187, + "learning_rate": 0.0013444641944102052, + "loss": 0.0081, + "num_input_tokens_seen": 34640064, + "step": 38295 + }, + { + "epoch": 10.108222251550746, + "grad_norm": 0.0008529364131391048, + "learning_rate": 0.0013366066524441056, + "loss": 0.0055, + "num_input_tokens_seen": 34644608, + "step": 38300 + }, + { + "epoch": 10.109542035106243, + "grad_norm": 0.0010161440586671233, + "learning_rate": 0.0013287720362414768, + "loss": 0.0073, + "num_input_tokens_seen": 34649312, + "step": 38305 + }, + { + "epoch": 10.11086181866174, + "grad_norm": 0.0024902489967644215, + "learning_rate": 0.0013209603470105025, + "loss": 0.0505, + "num_input_tokens_seen": 34653728, + "step": 38310 + }, + { + "epoch": 10.112181602217236, + "grad_norm": 0.00108978315256536, + "learning_rate": 0.0013131715859558857, + "loss": 0.0096, + "num_input_tokens_seen": 34658048, + "step": 38315 + }, + { + "epoch": 10.113501385772734, + "grad_norm": 0.000453246699180454, + "learning_rate": 0.001305405754278699, + "loss": 0.0132, + "num_input_tokens_seen": 34662176, + "step": 38320 + }, + { + "epoch": 10.11482116932823, + "grad_norm": 0.0027204283978790045, + "learning_rate": 0.0012976628531765843, + "loss": 0.0126, + "num_input_tokens_seen": 34666816, + "step": 38325 + }, + { + "epoch": 10.116140952883727, + "grad_norm": 0.006006808485835791, + "learning_rate": 0.0012899428838435533, + "loss": 0.0283, + "num_input_tokens_seen": 34671104, + "step": 38330 + }, + { + "epoch": 10.117460736439224, + "grad_norm": 0.00428816070780158, + "learning_rate": 0.001282245847470137, + "loss": 0.0097, + "num_input_tokens_seen": 34675424, + "step": 38335 + }, + { + "epoch": 10.11878051999472, + "grad_norm": 0.001392715610563755, + "learning_rate": 0.001274571745243319, + "loss": 0.0057, + "num_input_tokens_seen": 34679904, + "step": 38340 + }, + { + "epoch": 10.120100303550219, + "grad_norm": 0.0007449755212292075, + "learning_rate": 0.0012669205783465364, + "loss": 0.0055, + "num_input_tokens_seen": 34684384, + "step": 38345 + }, + { + "epoch": 10.121420087105715, + "grad_norm": 0.000471914594527334, + "learning_rate": 0.001259292347959695, + "loss": 0.0022, + "num_input_tokens_seen": 34688736, + "step": 38350 + }, + { + "epoch": 10.122739870661212, + "grad_norm": 0.0005674849380739033, + "learning_rate": 0.0012516870552591707, + "loss": 0.0054, + "num_input_tokens_seen": 34693440, + "step": 38355 + }, + { + "epoch": 10.124059654216708, + "grad_norm": 0.0003411788202356547, + "learning_rate": 0.001244104701417792, + "loss": 0.0134, + "num_input_tokens_seen": 34698464, + "step": 38360 + }, + { + "epoch": 10.125379437772205, + "grad_norm": 0.003428717376664281, + "learning_rate": 0.0012365452876048565, + "loss": 0.0065, + "num_input_tokens_seen": 34702976, + "step": 38365 + }, + { + "epoch": 10.126699221327701, + "grad_norm": 0.0018610862316563725, + "learning_rate": 0.001229008814986099, + "loss": 0.0051, + "num_input_tokens_seen": 34707360, + "step": 38370 + }, + { + "epoch": 10.1280190048832, + "grad_norm": 0.0007895971066318452, + "learning_rate": 0.0012214952847237725, + "loss": 0.0103, + "num_input_tokens_seen": 34711904, + "step": 38375 + }, + { + "epoch": 10.129338788438696, + "grad_norm": 0.002529770601540804, + "learning_rate": 0.0012140046979765339, + "loss": 0.0194, + "num_input_tokens_seen": 34716672, + "step": 38380 + }, + { + "epoch": 10.130658571994193, + "grad_norm": 0.0009470080258324742, + "learning_rate": 0.0012065370558995258, + "loss": 0.0162, + "num_input_tokens_seen": 34721312, + "step": 38385 + }, + { + "epoch": 10.13197835554969, + "grad_norm": 0.0025423525366932154, + "learning_rate": 0.0011990923596443602, + "loss": 0.0192, + "num_input_tokens_seen": 34725856, + "step": 38390 + }, + { + "epoch": 10.133298139105186, + "grad_norm": 0.005289915017783642, + "learning_rate": 0.001191670610359119, + "loss": 0.0208, + "num_input_tokens_seen": 34730496, + "step": 38395 + }, + { + "epoch": 10.134617922660684, + "grad_norm": 0.0005221194587647915, + "learning_rate": 0.0011842718091882865, + "loss": 0.0127, + "num_input_tokens_seen": 34734976, + "step": 38400 + }, + { + "epoch": 10.134617922660684, + "eval_loss": 0.11209084838628769, + "eval_runtime": 75.9082, + "eval_samples_per_second": 88.726, + "eval_steps_per_second": 22.185, + "num_input_tokens_seen": 34734976, + "step": 38400 + }, + { + "epoch": 10.13593770621618, + "grad_norm": 0.0009509850642643869, + "learning_rate": 0.0011768959572729, + "loss": 0.0117, + "num_input_tokens_seen": 34739712, + "step": 38405 + }, + { + "epoch": 10.137257489771677, + "grad_norm": 0.002245575189590454, + "learning_rate": 0.001169543055750366, + "loss": 0.014, + "num_input_tokens_seen": 34743936, + "step": 38410 + }, + { + "epoch": 10.138577273327174, + "grad_norm": 0.0009263523970730603, + "learning_rate": 0.0011622131057546115, + "loss": 0.0108, + "num_input_tokens_seen": 34748320, + "step": 38415 + }, + { + "epoch": 10.13989705688267, + "grad_norm": 0.0038031882140785456, + "learning_rate": 0.0011549061084160316, + "loss": 0.0143, + "num_input_tokens_seen": 34752800, + "step": 38420 + }, + { + "epoch": 10.141216840438169, + "grad_norm": 0.0018155290745198727, + "learning_rate": 0.0011476220648614088, + "loss": 0.0051, + "num_input_tokens_seen": 34756992, + "step": 38425 + }, + { + "epoch": 10.142536623993665, + "grad_norm": 0.004573917016386986, + "learning_rate": 0.0011403609762140777, + "loss": 0.023, + "num_input_tokens_seen": 34761440, + "step": 38430 + }, + { + "epoch": 10.143856407549162, + "grad_norm": 0.0006293572951108217, + "learning_rate": 0.0011331228435937756, + "loss": 0.0037, + "num_input_tokens_seen": 34766016, + "step": 38435 + }, + { + "epoch": 10.145176191104659, + "grad_norm": 0.00023711264657322317, + "learning_rate": 0.0011259076681166935, + "loss": 0.0043, + "num_input_tokens_seen": 34770592, + "step": 38440 + }, + { + "epoch": 10.146495974660155, + "grad_norm": 0.0023452448658645153, + "learning_rate": 0.0011187154508955244, + "loss": 0.0284, + "num_input_tokens_seen": 34774976, + "step": 38445 + }, + { + "epoch": 10.147815758215653, + "grad_norm": 0.00203748419880867, + "learning_rate": 0.001111546193039381, + "loss": 0.0208, + "num_input_tokens_seen": 34779328, + "step": 38450 + }, + { + "epoch": 10.14913554177115, + "grad_norm": 0.0020021938253194094, + "learning_rate": 0.0011043998956538792, + "loss": 0.019, + "num_input_tokens_seen": 34784192, + "step": 38455 + }, + { + "epoch": 10.150455325326647, + "grad_norm": 0.0018432162469252944, + "learning_rate": 0.0010972765598410538, + "loss": 0.0283, + "num_input_tokens_seen": 34788768, + "step": 38460 + }, + { + "epoch": 10.151775108882143, + "grad_norm": 0.0008492256165482104, + "learning_rate": 0.0010901761866993931, + "loss": 0.008, + "num_input_tokens_seen": 34793632, + "step": 38465 + }, + { + "epoch": 10.15309489243764, + "grad_norm": 0.000467837875476107, + "learning_rate": 0.0010830987773238876, + "loss": 0.0346, + "num_input_tokens_seen": 34798368, + "step": 38470 + }, + { + "epoch": 10.154414675993138, + "grad_norm": 0.0015938646392896771, + "learning_rate": 0.0010760443328059644, + "loss": 0.0107, + "num_input_tokens_seen": 34802784, + "step": 38475 + }, + { + "epoch": 10.155734459548635, + "grad_norm": 0.0005703948554582894, + "learning_rate": 0.001069012854233503, + "loss": 0.0059, + "num_input_tokens_seen": 34807232, + "step": 38480 + }, + { + "epoch": 10.157054243104131, + "grad_norm": 0.0033822718542069197, + "learning_rate": 0.0010620043426908365, + "loss": 0.0171, + "num_input_tokens_seen": 34811616, + "step": 38485 + }, + { + "epoch": 10.158374026659628, + "grad_norm": 0.0012470092624425888, + "learning_rate": 0.0010550187992587833, + "loss": 0.009, + "num_input_tokens_seen": 34816288, + "step": 38490 + }, + { + "epoch": 10.159693810215124, + "grad_norm": 0.0025698216632008553, + "learning_rate": 0.0010480562250145653, + "loss": 0.0219, + "num_input_tokens_seen": 34820768, + "step": 38495 + }, + { + "epoch": 10.16101359377062, + "grad_norm": 0.002630834933370352, + "learning_rate": 0.0010411166210319567, + "loss": 0.0178, + "num_input_tokens_seen": 34825632, + "step": 38500 + }, + { + "epoch": 10.162333377326119, + "grad_norm": 0.0012251753360033035, + "learning_rate": 0.0010341999883810848, + "loss": 0.0114, + "num_input_tokens_seen": 34830112, + "step": 38505 + }, + { + "epoch": 10.163653160881616, + "grad_norm": 0.00043971880222670734, + "learning_rate": 0.0010273063281285965, + "loss": 0.0076, + "num_input_tokens_seen": 34834464, + "step": 38510 + }, + { + "epoch": 10.164972944437112, + "grad_norm": 0.00187941815238446, + "learning_rate": 0.0010204356413375747, + "loss": 0.0095, + "num_input_tokens_seen": 34839072, + "step": 38515 + }, + { + "epoch": 10.166292727992609, + "grad_norm": 0.0012757793301716447, + "learning_rate": 0.001013587929067572, + "loss": 0.0097, + "num_input_tokens_seen": 34843648, + "step": 38520 + }, + { + "epoch": 10.167612511548105, + "grad_norm": 0.0006585942464880645, + "learning_rate": 0.00100676319237461, + "loss": 0.0061, + "num_input_tokens_seen": 34848000, + "step": 38525 + }, + { + "epoch": 10.168932295103604, + "grad_norm": 0.00014740203914698213, + "learning_rate": 0.0009999614323110972, + "loss": 0.0057, + "num_input_tokens_seen": 34852608, + "step": 38530 + }, + { + "epoch": 10.1702520786591, + "grad_norm": 0.0013397824950516224, + "learning_rate": 0.000993182649926011, + "loss": 0.0105, + "num_input_tokens_seen": 34856960, + "step": 38535 + }, + { + "epoch": 10.171571862214597, + "grad_norm": 0.0013091567670926452, + "learning_rate": 0.000986426846264682, + "loss": 0.0087, + "num_input_tokens_seen": 34861536, + "step": 38540 + }, + { + "epoch": 10.172891645770093, + "grad_norm": 0.0013562164967879653, + "learning_rate": 0.00097969402236896, + "loss": 0.0086, + "num_input_tokens_seen": 34865888, + "step": 38545 + }, + { + "epoch": 10.17421142932559, + "grad_norm": 0.0009451466030441225, + "learning_rate": 0.0009729841792771143, + "loss": 0.0068, + "num_input_tokens_seen": 34870496, + "step": 38550 + }, + { + "epoch": 10.175531212881088, + "grad_norm": 0.00035453998134471476, + "learning_rate": 0.0009662973180239176, + "loss": 0.0051, + "num_input_tokens_seen": 34875168, + "step": 38555 + }, + { + "epoch": 10.176850996436585, + "grad_norm": 0.0013395763235166669, + "learning_rate": 0.0009596334396405448, + "loss": 0.007, + "num_input_tokens_seen": 34879744, + "step": 38560 + }, + { + "epoch": 10.178170779992081, + "grad_norm": 0.0022494227159768343, + "learning_rate": 0.0009529925451546406, + "loss": 0.0074, + "num_input_tokens_seen": 34884576, + "step": 38565 + }, + { + "epoch": 10.179490563547578, + "grad_norm": 0.002361964900046587, + "learning_rate": 0.0009463746355903357, + "loss": 0.0141, + "num_input_tokens_seen": 34888928, + "step": 38570 + }, + { + "epoch": 10.180810347103074, + "grad_norm": 0.0006635171012021601, + "learning_rate": 0.0009397797119681971, + "loss": 0.0111, + "num_input_tokens_seen": 34893504, + "step": 38575 + }, + { + "epoch": 10.182130130658573, + "grad_norm": 0.0033276663161814213, + "learning_rate": 0.0009332077753052281, + "loss": 0.0106, + "num_input_tokens_seen": 34897984, + "step": 38580 + }, + { + "epoch": 10.18344991421407, + "grad_norm": 0.0004000767949037254, + "learning_rate": 0.0009266588266149011, + "loss": 0.0296, + "num_input_tokens_seen": 34902208, + "step": 38585 + }, + { + "epoch": 10.184769697769566, + "grad_norm": 0.0007477931212633848, + "learning_rate": 0.0009201328669071584, + "loss": 0.0082, + "num_input_tokens_seen": 34906880, + "step": 38590 + }, + { + "epoch": 10.186089481325062, + "grad_norm": 0.000750758801586926, + "learning_rate": 0.0009136298971883949, + "loss": 0.0052, + "num_input_tokens_seen": 34911104, + "step": 38595 + }, + { + "epoch": 10.187409264880559, + "grad_norm": 2.2851614630781114e-05, + "learning_rate": 0.0009071499184614251, + "loss": 0.0143, + "num_input_tokens_seen": 34915936, + "step": 38600 + }, + { + "epoch": 10.187409264880559, + "eval_loss": 0.11171893030405045, + "eval_runtime": 75.9022, + "eval_samples_per_second": 88.733, + "eval_steps_per_second": 22.186, + "num_input_tokens_seen": 34915936, + "step": 38600 + }, + { + "epoch": 10.188729048436057, + "grad_norm": 0.0020432565361261368, + "learning_rate": 0.0009006929317255663, + "loss": 0.0129, + "num_input_tokens_seen": 34920416, + "step": 38605 + }, + { + "epoch": 10.190048831991554, + "grad_norm": 0.00021049029601272196, + "learning_rate": 0.0008942589379765387, + "loss": 0.0027, + "num_input_tokens_seen": 34924960, + "step": 38610 + }, + { + "epoch": 10.19136861554705, + "grad_norm": 0.0009777328232303262, + "learning_rate": 0.0008878479382065817, + "loss": 0.0086, + "num_input_tokens_seen": 34929376, + "step": 38615 + }, + { + "epoch": 10.192688399102547, + "grad_norm": 0.0018702916568145156, + "learning_rate": 0.0008814599334043215, + "loss": 0.0136, + "num_input_tokens_seen": 34933824, + "step": 38620 + }, + { + "epoch": 10.194008182658044, + "grad_norm": 0.0005735793383792043, + "learning_rate": 0.0008750949245548866, + "loss": 0.02, + "num_input_tokens_seen": 34938272, + "step": 38625 + }, + { + "epoch": 10.19532796621354, + "grad_norm": 0.0020881816744804382, + "learning_rate": 0.0008687529126398252, + "loss": 0.0118, + "num_input_tokens_seen": 34942912, + "step": 38630 + }, + { + "epoch": 10.196647749769038, + "grad_norm": 0.0007546623819507658, + "learning_rate": 0.0008624338986371715, + "loss": 0.0094, + "num_input_tokens_seen": 34947584, + "step": 38635 + }, + { + "epoch": 10.197967533324535, + "grad_norm": 0.00014583583106286824, + "learning_rate": 0.0008561378835213962, + "loss": 0.0231, + "num_input_tokens_seen": 34952096, + "step": 38640 + }, + { + "epoch": 10.199287316880032, + "grad_norm": 9.302140097133815e-05, + "learning_rate": 0.0008498648682634058, + "loss": 0.0095, + "num_input_tokens_seen": 34956768, + "step": 38645 + }, + { + "epoch": 10.200607100435528, + "grad_norm": 0.0029044272378087044, + "learning_rate": 0.0008436148538306099, + "loss": 0.0068, + "num_input_tokens_seen": 34961344, + "step": 38650 + }, + { + "epoch": 10.201926883991025, + "grad_norm": 0.0014319696929305792, + "learning_rate": 0.0008373878411868041, + "loss": 0.0089, + "num_input_tokens_seen": 34965792, + "step": 38655 + }, + { + "epoch": 10.203246667546523, + "grad_norm": 0.002713090041652322, + "learning_rate": 0.000831183831292287, + "loss": 0.0132, + "num_input_tokens_seen": 34970240, + "step": 38660 + }, + { + "epoch": 10.20456645110202, + "grad_norm": 0.0009425183525308967, + "learning_rate": 0.0008250028251037933, + "loss": 0.0058, + "num_input_tokens_seen": 34974752, + "step": 38665 + }, + { + "epoch": 10.205886234657516, + "grad_norm": 0.0004921094514429569, + "learning_rate": 0.0008188448235745271, + "loss": 0.0179, + "num_input_tokens_seen": 34979104, + "step": 38670 + }, + { + "epoch": 10.207206018213013, + "grad_norm": 0.0005862296675331891, + "learning_rate": 0.0008127098276541122, + "loss": 0.0067, + "num_input_tokens_seen": 34984096, + "step": 38675 + }, + { + "epoch": 10.20852580176851, + "grad_norm": 0.0011910955654457211, + "learning_rate": 0.0008065978382886418, + "loss": 0.0049, + "num_input_tokens_seen": 34988352, + "step": 38680 + }, + { + "epoch": 10.209845585324008, + "grad_norm": 0.004433650057762861, + "learning_rate": 0.0008005088564206785, + "loss": 0.0171, + "num_input_tokens_seen": 34993088, + "step": 38685 + }, + { + "epoch": 10.211165368879504, + "grad_norm": 0.005166062619537115, + "learning_rate": 0.0007944428829891881, + "loss": 0.0283, + "num_input_tokens_seen": 34997664, + "step": 38690 + }, + { + "epoch": 10.212485152435, + "grad_norm": 0.0004988633445464075, + "learning_rate": 0.0007883999189296386, + "loss": 0.0378, + "num_input_tokens_seen": 35002432, + "step": 38695 + }, + { + "epoch": 10.213804935990497, + "grad_norm": 0.0017908295849338174, + "learning_rate": 0.0007823799651739515, + "loss": 0.0077, + "num_input_tokens_seen": 35006976, + "step": 38700 + }, + { + "epoch": 10.215124719545994, + "grad_norm": 0.0027388192247599363, + "learning_rate": 0.0007763830226504509, + "loss": 0.0128, + "num_input_tokens_seen": 35011456, + "step": 38705 + }, + { + "epoch": 10.216444503101492, + "grad_norm": 0.0005926011363044381, + "learning_rate": 0.0007704090922839468, + "loss": 0.0267, + "num_input_tokens_seen": 35015616, + "step": 38710 + }, + { + "epoch": 10.217764286656989, + "grad_norm": 0.0020084925927221775, + "learning_rate": 0.0007644581749957025, + "loss": 0.0196, + "num_input_tokens_seen": 35020512, + "step": 38715 + }, + { + "epoch": 10.219084070212485, + "grad_norm": 0.0003028182836715132, + "learning_rate": 0.000758530271703417, + "loss": 0.012, + "num_input_tokens_seen": 35025408, + "step": 38720 + }, + { + "epoch": 10.220403853767982, + "grad_norm": 0.0002950546913780272, + "learning_rate": 0.0007526253833212426, + "loss": 0.0415, + "num_input_tokens_seen": 35029696, + "step": 38725 + }, + { + "epoch": 10.221723637323478, + "grad_norm": 0.0002286693488713354, + "learning_rate": 0.0007467435107598008, + "loss": 0.0351, + "num_input_tokens_seen": 35033856, + "step": 38730 + }, + { + "epoch": 10.223043420878977, + "grad_norm": 0.002900082152336836, + "learning_rate": 0.0007408846549261328, + "loss": 0.0306, + "num_input_tokens_seen": 35038304, + "step": 38735 + }, + { + "epoch": 10.224363204434473, + "grad_norm": 0.0006697331555187702, + "learning_rate": 0.0007350488167237656, + "loss": 0.0157, + "num_input_tokens_seen": 35042976, + "step": 38740 + }, + { + "epoch": 10.22568298798997, + "grad_norm": 0.0034307760652154684, + "learning_rate": 0.0007292359970526629, + "loss": 0.0175, + "num_input_tokens_seen": 35047584, + "step": 38745 + }, + { + "epoch": 10.227002771545466, + "grad_norm": 0.0013902949867770076, + "learning_rate": 0.0007234461968092076, + "loss": 0.0058, + "num_input_tokens_seen": 35051904, + "step": 38750 + }, + { + "epoch": 10.228322555100963, + "grad_norm": 0.0018815909279510379, + "learning_rate": 0.0007176794168862854, + "loss": 0.0095, + "num_input_tokens_seen": 35056576, + "step": 38755 + }, + { + "epoch": 10.229642338656461, + "grad_norm": 0.00010389603994553909, + "learning_rate": 0.000711935658173185, + "loss": 0.0031, + "num_input_tokens_seen": 35061280, + "step": 38760 + }, + { + "epoch": 10.230962122211958, + "grad_norm": 0.0008609314099885523, + "learning_rate": 0.0007062149215556812, + "loss": 0.0081, + "num_input_tokens_seen": 35065536, + "step": 38765 + }, + { + "epoch": 10.232281905767454, + "grad_norm": 0.0019339444115757942, + "learning_rate": 0.0007005172079159849, + "loss": 0.0106, + "num_input_tokens_seen": 35070016, + "step": 38770 + }, + { + "epoch": 10.233601689322951, + "grad_norm": 0.0032678900752216578, + "learning_rate": 0.0006948425181327267, + "loss": 0.023, + "num_input_tokens_seen": 35074528, + "step": 38775 + }, + { + "epoch": 10.234921472878447, + "grad_norm": 0.00027515419060364366, + "learning_rate": 0.000689190853081073, + "loss": 0.008, + "num_input_tokens_seen": 35079136, + "step": 38780 + }, + { + "epoch": 10.236241256433944, + "grad_norm": 0.00016335071995854378, + "learning_rate": 0.000683562213632527, + "loss": 0.0167, + "num_input_tokens_seen": 35083712, + "step": 38785 + }, + { + "epoch": 10.237561039989442, + "grad_norm": 0.0023205759935081005, + "learning_rate": 0.0006779566006551108, + "loss": 0.0062, + "num_input_tokens_seen": 35088128, + "step": 38790 + }, + { + "epoch": 10.238880823544939, + "grad_norm": 0.004144672770053148, + "learning_rate": 0.0006723740150132995, + "loss": 0.0179, + "num_input_tokens_seen": 35092640, + "step": 38795 + }, + { + "epoch": 10.240200607100435, + "grad_norm": 0.00030886579770594835, + "learning_rate": 0.0006668144575679713, + "loss": 0.0073, + "num_input_tokens_seen": 35096960, + "step": 38800 + }, + { + "epoch": 10.240200607100435, + "eval_loss": 0.11160609871149063, + "eval_runtime": 76.033, + "eval_samples_per_second": 88.58, + "eval_steps_per_second": 22.148, + "num_input_tokens_seen": 35096960, + "step": 38800 + }, + { + "epoch": 10.241520390655932, + "grad_norm": 0.00019634337513707578, + "learning_rate": 0.0006612779291765069, + "loss": 0.0094, + "num_input_tokens_seen": 35101312, + "step": 38805 + }, + { + "epoch": 10.242840174211429, + "grad_norm": 0.0007426525698974729, + "learning_rate": 0.0006557644306926736, + "loss": 0.0157, + "num_input_tokens_seen": 35105856, + "step": 38810 + }, + { + "epoch": 10.244159957766927, + "grad_norm": 0.0013942570658400655, + "learning_rate": 0.0006502739629667575, + "loss": 0.0097, + "num_input_tokens_seen": 35110816, + "step": 38815 + }, + { + "epoch": 10.245479741322423, + "grad_norm": 0.0022004263009876013, + "learning_rate": 0.0006448065268454317, + "loss": 0.0046, + "num_input_tokens_seen": 35115168, + "step": 38820 + }, + { + "epoch": 10.24679952487792, + "grad_norm": 0.0012147387024015188, + "learning_rate": 0.0006393621231718549, + "loss": 0.011, + "num_input_tokens_seen": 35119680, + "step": 38825 + }, + { + "epoch": 10.248119308433417, + "grad_norm": 0.00042722068610601127, + "learning_rate": 0.0006339407527856389, + "loss": 0.0063, + "num_input_tokens_seen": 35124032, + "step": 38830 + }, + { + "epoch": 10.249439091988913, + "grad_norm": 0.0012704534456133842, + "learning_rate": 0.0006285424165227982, + "loss": 0.0103, + "num_input_tokens_seen": 35128224, + "step": 38835 + }, + { + "epoch": 10.250758875544411, + "grad_norm": 0.004893970210105181, + "learning_rate": 0.0006231671152158169, + "loss": 0.0125, + "num_input_tokens_seen": 35132704, + "step": 38840 + }, + { + "epoch": 10.252078659099908, + "grad_norm": 0.002485675970092416, + "learning_rate": 0.0006178148496936819, + "loss": 0.0152, + "num_input_tokens_seen": 35137280, + "step": 38845 + }, + { + "epoch": 10.253398442655405, + "grad_norm": 0.0013094169553369284, + "learning_rate": 0.000612485620781733, + "loss": 0.0265, + "num_input_tokens_seen": 35141536, + "step": 38850 + }, + { + "epoch": 10.254718226210901, + "grad_norm": 0.0007589819142594934, + "learning_rate": 0.0006071794293018296, + "loss": 0.0152, + "num_input_tokens_seen": 35146016, + "step": 38855 + }, + { + "epoch": 10.256038009766398, + "grad_norm": 0.0021715546026825905, + "learning_rate": 0.0006018962760722501, + "loss": 0.0064, + "num_input_tokens_seen": 35150496, + "step": 38860 + }, + { + "epoch": 10.257357793321896, + "grad_norm": 0.0006066461210139096, + "learning_rate": 0.0005966361619077098, + "loss": 0.0069, + "num_input_tokens_seen": 35154976, + "step": 38865 + }, + { + "epoch": 10.258677576877393, + "grad_norm": 0.002905123634263873, + "learning_rate": 0.000591399087619393, + "loss": 0.0267, + "num_input_tokens_seen": 35159456, + "step": 38870 + }, + { + "epoch": 10.25999736043289, + "grad_norm": 0.00036940324935130775, + "learning_rate": 0.0005861850540149371, + "loss": 0.015, + "num_input_tokens_seen": 35163776, + "step": 38875 + }, + { + "epoch": 10.261317143988386, + "grad_norm": 0.004588534589856863, + "learning_rate": 0.0005809940618983822, + "loss": 0.0205, + "num_input_tokens_seen": 35168384, + "step": 38880 + }, + { + "epoch": 10.262636927543882, + "grad_norm": 0.002184451324865222, + "learning_rate": 0.0005758261120702712, + "loss": 0.0073, + "num_input_tokens_seen": 35172832, + "step": 38885 + }, + { + "epoch": 10.263956711099379, + "grad_norm": 0.000890616443939507, + "learning_rate": 0.0005706812053275501, + "loss": 0.0135, + "num_input_tokens_seen": 35177344, + "step": 38890 + }, + { + "epoch": 10.265276494654877, + "grad_norm": 0.002759612165391445, + "learning_rate": 0.0005655593424636173, + "loss": 0.0165, + "num_input_tokens_seen": 35181696, + "step": 38895 + }, + { + "epoch": 10.266596278210374, + "grad_norm": 0.001497076009400189, + "learning_rate": 0.0005604605242683746, + "loss": 0.0175, + "num_input_tokens_seen": 35186112, + "step": 38900 + }, + { + "epoch": 10.26791606176587, + "grad_norm": 0.001148729119449854, + "learning_rate": 0.0005553847515280596, + "loss": 0.0097, + "num_input_tokens_seen": 35190528, + "step": 38905 + }, + { + "epoch": 10.269235845321367, + "grad_norm": 0.0017533444333821535, + "learning_rate": 0.0005503320250254795, + "loss": 0.0056, + "num_input_tokens_seen": 35194976, + "step": 38910 + }, + { + "epoch": 10.270555628876863, + "grad_norm": 0.0008598456624895334, + "learning_rate": 0.0005453023455397943, + "loss": 0.0033, + "num_input_tokens_seen": 35199552, + "step": 38915 + }, + { + "epoch": 10.271875412432362, + "grad_norm": 0.0004560522793326527, + "learning_rate": 0.0005402957138466502, + "loss": 0.0224, + "num_input_tokens_seen": 35204352, + "step": 38920 + }, + { + "epoch": 10.273195195987858, + "grad_norm": 0.004232468549162149, + "learning_rate": 0.0005353121307181463, + "loss": 0.0202, + "num_input_tokens_seen": 35208832, + "step": 38925 + }, + { + "epoch": 10.274514979543355, + "grad_norm": 0.002507344353944063, + "learning_rate": 0.0005303515969227845, + "loss": 0.0101, + "num_input_tokens_seen": 35213184, + "step": 38930 + }, + { + "epoch": 10.275834763098851, + "grad_norm": 0.0006860227440483868, + "learning_rate": 0.0005254141132255862, + "loss": 0.0318, + "num_input_tokens_seen": 35217440, + "step": 38935 + }, + { + "epoch": 10.277154546654348, + "grad_norm": 0.004155730828642845, + "learning_rate": 0.0005204996803879258, + "loss": 0.0137, + "num_input_tokens_seen": 35222112, + "step": 38940 + }, + { + "epoch": 10.278474330209846, + "grad_norm": 0.002235573949292302, + "learning_rate": 0.0005156082991676969, + "loss": 0.0197, + "num_input_tokens_seen": 35226752, + "step": 38945 + }, + { + "epoch": 10.279794113765343, + "grad_norm": 0.0029481202363967896, + "learning_rate": 0.0005107399703192127, + "loss": 0.018, + "num_input_tokens_seen": 35231584, + "step": 38950 + }, + { + "epoch": 10.28111389732084, + "grad_norm": 0.0013322103768587112, + "learning_rate": 0.0005058946945932063, + "loss": 0.0122, + "num_input_tokens_seen": 35236032, + "step": 38955 + }, + { + "epoch": 10.282433680876336, + "grad_norm": 0.00339678768068552, + "learning_rate": 0.0005010724727369131, + "loss": 0.013, + "num_input_tokens_seen": 35240352, + "step": 38960 + }, + { + "epoch": 10.283753464431832, + "grad_norm": 0.00017273098637815565, + "learning_rate": 0.000496273305493955, + "loss": 0.0098, + "num_input_tokens_seen": 35245184, + "step": 38965 + }, + { + "epoch": 10.28507324798733, + "grad_norm": 0.0009767054580152035, + "learning_rate": 0.0004914971936044399, + "loss": 0.0243, + "num_input_tokens_seen": 35249696, + "step": 38970 + }, + { + "epoch": 10.286393031542827, + "grad_norm": 0.0003676658379845321, + "learning_rate": 0.00048674413780491196, + "loss": 0.0043, + "num_input_tokens_seen": 35254432, + "step": 38975 + }, + { + "epoch": 10.287712815098324, + "grad_norm": 0.0003831555077340454, + "learning_rate": 0.0004820141388283183, + "loss": 0.0359, + "num_input_tokens_seen": 35258528, + "step": 38980 + }, + { + "epoch": 10.28903259865382, + "grad_norm": 0.005001319106668234, + "learning_rate": 0.00047730719740410874, + "loss": 0.008, + "num_input_tokens_seen": 35262720, + "step": 38985 + }, + { + "epoch": 10.290352382209317, + "grad_norm": 0.00032830226700752974, + "learning_rate": 0.00047262331425816927, + "loss": 0.0044, + "num_input_tokens_seen": 35267232, + "step": 38990 + }, + { + "epoch": 10.291672165764815, + "grad_norm": 0.0008826713310554624, + "learning_rate": 0.00046796249011277213, + "loss": 0.0027, + "num_input_tokens_seen": 35271936, + "step": 38995 + }, + { + "epoch": 10.292991949320312, + "grad_norm": 0.00047061993973329663, + "learning_rate": 0.00046332472568669236, + "loss": 0.0124, + "num_input_tokens_seen": 35276448, + "step": 39000 + }, + { + "epoch": 10.292991949320312, + "eval_loss": 0.11206556856632233, + "eval_runtime": 75.8645, + "eval_samples_per_second": 88.777, + "eval_steps_per_second": 22.197, + "num_input_tokens_seen": 35276448, + "step": 39000 + }, + { + "epoch": 10.294311732875808, + "grad_norm": 0.0030519880820065737, + "learning_rate": 0.0004587100216951578, + "loss": 0.0195, + "num_input_tokens_seen": 35280896, + "step": 39005 + }, + { + "epoch": 10.295631516431305, + "grad_norm": 0.003656503977254033, + "learning_rate": 0.00045411837884978265, + "loss": 0.0337, + "num_input_tokens_seen": 35285504, + "step": 39010 + }, + { + "epoch": 10.296951299986802, + "grad_norm": 0.00126769975759089, + "learning_rate": 0.00044954979785865045, + "loss": 0.0045, + "num_input_tokens_seen": 35290272, + "step": 39015 + }, + { + "epoch": 10.2982710835423, + "grad_norm": 2.1220157577772625e-05, + "learning_rate": 0.00044500427942631426, + "loss": 0.0199, + "num_input_tokens_seen": 35294784, + "step": 39020 + }, + { + "epoch": 10.299590867097796, + "grad_norm": 0.0005085199954919517, + "learning_rate": 0.0004404818242537467, + "loss": 0.0125, + "num_input_tokens_seen": 35299328, + "step": 39025 + }, + { + "epoch": 10.300910650653293, + "grad_norm": 0.0031687042210251093, + "learning_rate": 0.00043598243303837324, + "loss": 0.011, + "num_input_tokens_seen": 35303840, + "step": 39030 + }, + { + "epoch": 10.30223043420879, + "grad_norm": 0.0017896204954013228, + "learning_rate": 0.00043150610647403885, + "loss": 0.0155, + "num_input_tokens_seen": 35308416, + "step": 39035 + }, + { + "epoch": 10.303550217764286, + "grad_norm": 0.000771115708630532, + "learning_rate": 0.00042705284525104134, + "loss": 0.015, + "num_input_tokens_seen": 35312960, + "step": 39040 + }, + { + "epoch": 10.304870001319783, + "grad_norm": 0.0004174084751866758, + "learning_rate": 0.0004226226500561647, + "loss": 0.0092, + "num_input_tokens_seen": 35317504, + "step": 39045 + }, + { + "epoch": 10.306189784875281, + "grad_norm": 0.001670117606408894, + "learning_rate": 0.0004182155215725791, + "loss": 0.0131, + "num_input_tokens_seen": 35322080, + "step": 39050 + }, + { + "epoch": 10.307509568430778, + "grad_norm": 0.00040360589628107846, + "learning_rate": 0.00041383146047992424, + "loss": 0.0038, + "num_input_tokens_seen": 35326752, + "step": 39055 + }, + { + "epoch": 10.308829351986274, + "grad_norm": 0.0005944407894276083, + "learning_rate": 0.00040947046745427597, + "loss": 0.0135, + "num_input_tokens_seen": 35331552, + "step": 39060 + }, + { + "epoch": 10.31014913554177, + "grad_norm": 0.004200811497867107, + "learning_rate": 0.00040513254316814625, + "loss": 0.0266, + "num_input_tokens_seen": 35335648, + "step": 39065 + }, + { + "epoch": 10.311468919097267, + "grad_norm": 0.00025990756694227457, + "learning_rate": 0.0004008176882905168, + "loss": 0.0097, + "num_input_tokens_seen": 35340544, + "step": 39070 + }, + { + "epoch": 10.312788702652766, + "grad_norm": 0.0007579321973025799, + "learning_rate": 0.00039652590348677184, + "loss": 0.009, + "num_input_tokens_seen": 35345280, + "step": 39075 + }, + { + "epoch": 10.314108486208262, + "grad_norm": 0.0017048756126314402, + "learning_rate": 0.00039225718941878206, + "loss": 0.0129, + "num_input_tokens_seen": 35349568, + "step": 39080 + }, + { + "epoch": 10.315428269763759, + "grad_norm": 0.0018636091845110059, + "learning_rate": 0.00038801154674480417, + "loss": 0.0151, + "num_input_tokens_seen": 35353888, + "step": 39085 + }, + { + "epoch": 10.316748053319255, + "grad_norm": 0.00043535116128623486, + "learning_rate": 0.00038378897611959784, + "loss": 0.0031, + "num_input_tokens_seen": 35358752, + "step": 39090 + }, + { + "epoch": 10.318067836874752, + "grad_norm": 0.0013281158171594143, + "learning_rate": 0.00037958947819430875, + "loss": 0.0464, + "num_input_tokens_seen": 35363168, + "step": 39095 + }, + { + "epoch": 10.31938762043025, + "grad_norm": 0.000983810517936945, + "learning_rate": 0.0003754130536165856, + "loss": 0.0158, + "num_input_tokens_seen": 35367616, + "step": 39100 + }, + { + "epoch": 10.320707403985747, + "grad_norm": 0.00043315155198797584, + "learning_rate": 0.0003712597030304632, + "loss": 0.0048, + "num_input_tokens_seen": 35372000, + "step": 39105 + }, + { + "epoch": 10.322027187541243, + "grad_norm": 0.005058465525507927, + "learning_rate": 0.00036712942707646247, + "loss": 0.0213, + "num_input_tokens_seen": 35376416, + "step": 39110 + }, + { + "epoch": 10.32334697109674, + "grad_norm": 0.005135838873684406, + "learning_rate": 0.00036302222639149063, + "loss": 0.0218, + "num_input_tokens_seen": 35380704, + "step": 39115 + }, + { + "epoch": 10.324666754652236, + "grad_norm": 0.00035601749550551176, + "learning_rate": 0.000358938101608941, + "loss": 0.0251, + "num_input_tokens_seen": 35385120, + "step": 39120 + }, + { + "epoch": 10.325986538207735, + "grad_norm": 0.0002861733373720199, + "learning_rate": 0.0003548770533586598, + "loss": 0.0298, + "num_input_tokens_seen": 35389472, + "step": 39125 + }, + { + "epoch": 10.327306321763231, + "grad_norm": 0.0010340233566239476, + "learning_rate": 0.0003508390822668961, + "loss": 0.0264, + "num_input_tokens_seen": 35394176, + "step": 39130 + }, + { + "epoch": 10.328626105318728, + "grad_norm": 0.00018750988238025457, + "learning_rate": 0.00034682418895633503, + "loss": 0.0081, + "num_input_tokens_seen": 35398720, + "step": 39135 + }, + { + "epoch": 10.329945888874224, + "grad_norm": 0.003001391189172864, + "learning_rate": 0.0003428323740461647, + "loss": 0.0214, + "num_input_tokens_seen": 35403008, + "step": 39140 + }, + { + "epoch": 10.331265672429721, + "grad_norm": 0.0017339049372822046, + "learning_rate": 0.00033886363815194276, + "loss": 0.0135, + "num_input_tokens_seen": 35407360, + "step": 39145 + }, + { + "epoch": 10.33258545598522, + "grad_norm": 0.0019869080279022455, + "learning_rate": 0.0003349179818857129, + "loss": 0.0074, + "num_input_tokens_seen": 35411680, + "step": 39150 + }, + { + "epoch": 10.333905239540716, + "grad_norm": 0.0008870497695170343, + "learning_rate": 0.0003309954058559383, + "loss": 0.0099, + "num_input_tokens_seen": 35416096, + "step": 39155 + }, + { + "epoch": 10.335225023096212, + "grad_norm": 0.00014123671280685812, + "learning_rate": 0.0003270959106675186, + "loss": 0.0113, + "num_input_tokens_seen": 35420672, + "step": 39160 + }, + { + "epoch": 10.336544806651709, + "grad_norm": 0.00834434200078249, + "learning_rate": 0.0003232194969218227, + "loss": 0.027, + "num_input_tokens_seen": 35425312, + "step": 39165 + }, + { + "epoch": 10.337864590207205, + "grad_norm": 0.0028741653077304363, + "learning_rate": 0.00031936616521663905, + "loss": 0.0228, + "num_input_tokens_seen": 35429952, + "step": 39170 + }, + { + "epoch": 10.339184373762702, + "grad_norm": 0.0008751270943321288, + "learning_rate": 0.00031553591614619236, + "loss": 0.0073, + "num_input_tokens_seen": 35434464, + "step": 39175 + }, + { + "epoch": 10.3405041573182, + "grad_norm": 0.005043539218604565, + "learning_rate": 0.00031172875030117676, + "loss": 0.0235, + "num_input_tokens_seen": 35439008, + "step": 39180 + }, + { + "epoch": 10.341823940873697, + "grad_norm": 9.997194865718484e-05, + "learning_rate": 0.0003079446682686726, + "loss": 0.005, + "num_input_tokens_seen": 35443456, + "step": 39185 + }, + { + "epoch": 10.343143724429193, + "grad_norm": 0.0008160749566741288, + "learning_rate": 0.0003041836706322465, + "loss": 0.0056, + "num_input_tokens_seen": 35447872, + "step": 39190 + }, + { + "epoch": 10.34446350798469, + "grad_norm": 0.0007676410605199635, + "learning_rate": 0.0003004457579719011, + "loss": 0.0057, + "num_input_tokens_seen": 35452512, + "step": 39195 + }, + { + "epoch": 10.345783291540187, + "grad_norm": 0.0006611092248931527, + "learning_rate": 0.00029673093086405867, + "loss": 0.0143, + "num_input_tokens_seen": 35457088, + "step": 39200 + }, + { + "epoch": 10.345783291540187, + "eval_loss": 0.11201956123113632, + "eval_runtime": 75.9695, + "eval_samples_per_second": 88.654, + "eval_steps_per_second": 22.167, + "num_input_tokens_seen": 35457088, + "step": 39200 + }, + { + "epoch": 10.347103075095685, + "grad_norm": 0.004558449611067772, + "learning_rate": 0.00029303918988159426, + "loss": 0.0346, + "num_input_tokens_seen": 35461728, + "step": 39205 + }, + { + "epoch": 10.348422858651181, + "grad_norm": 0.006348281167447567, + "learning_rate": 0.0002893705355938192, + "loss": 0.0161, + "num_input_tokens_seen": 35466080, + "step": 39210 + }, + { + "epoch": 10.349742642206678, + "grad_norm": 0.0005845670821145177, + "learning_rate": 0.0002857249685664975, + "loss": 0.0027, + "num_input_tokens_seen": 35470496, + "step": 39215 + }, + { + "epoch": 10.351062425762175, + "grad_norm": 0.006436718627810478, + "learning_rate": 0.0002821024893618129, + "loss": 0.0147, + "num_input_tokens_seen": 35475200, + "step": 39220 + }, + { + "epoch": 10.352382209317671, + "grad_norm": 0.00019143495592288673, + "learning_rate": 0.0002785030985383852, + "loss": 0.0072, + "num_input_tokens_seen": 35479488, + "step": 39225 + }, + { + "epoch": 10.35370199287317, + "grad_norm": 0.00015114981215447187, + "learning_rate": 0.00027492679665130356, + "loss": 0.0108, + "num_input_tokens_seen": 35484160, + "step": 39230 + }, + { + "epoch": 10.355021776428666, + "grad_norm": 0.0016506884712725878, + "learning_rate": 0.000271373584252077, + "loss": 0.0153, + "num_input_tokens_seen": 35488960, + "step": 39235 + }, + { + "epoch": 10.356341559984163, + "grad_norm": 0.0012634481536224484, + "learning_rate": 0.00026784346188865046, + "loss": 0.0125, + "num_input_tokens_seen": 35493216, + "step": 39240 + }, + { + "epoch": 10.35766134353966, + "grad_norm": 0.001871624612249434, + "learning_rate": 0.0002643364301054218, + "loss": 0.009, + "num_input_tokens_seen": 35497728, + "step": 39245 + }, + { + "epoch": 10.358981127095156, + "grad_norm": 0.0023707314394414425, + "learning_rate": 0.0002608524894431918, + "loss": 0.0089, + "num_input_tokens_seen": 35502432, + "step": 39250 + }, + { + "epoch": 10.360300910650654, + "grad_norm": 0.0005127167678438127, + "learning_rate": 0.000257391640439264, + "loss": 0.0105, + "num_input_tokens_seen": 35506560, + "step": 39255 + }, + { + "epoch": 10.36162069420615, + "grad_norm": 0.0006164090591482818, + "learning_rate": 0.00025395388362732806, + "loss": 0.0123, + "num_input_tokens_seen": 35511328, + "step": 39260 + }, + { + "epoch": 10.362940477761647, + "grad_norm": 0.002178631257265806, + "learning_rate": 0.00025053921953751, + "loss": 0.0112, + "num_input_tokens_seen": 35516064, + "step": 39265 + }, + { + "epoch": 10.364260261317144, + "grad_norm": 0.00029026184347458184, + "learning_rate": 0.00024714764869643855, + "loss": 0.0166, + "num_input_tokens_seen": 35520576, + "step": 39270 + }, + { + "epoch": 10.36558004487264, + "grad_norm": 0.0017823486123234034, + "learning_rate": 0.0002437791716270954, + "loss": 0.0141, + "num_input_tokens_seen": 35525312, + "step": 39275 + }, + { + "epoch": 10.366899828428139, + "grad_norm": 0.0012472121743485332, + "learning_rate": 0.00024043378884896493, + "loss": 0.0313, + "num_input_tokens_seen": 35529632, + "step": 39280 + }, + { + "epoch": 10.368219611983635, + "grad_norm": 0.00023019639775156975, + "learning_rate": 0.00023711150087793453, + "loss": 0.008, + "num_input_tokens_seen": 35534144, + "step": 39285 + }, + { + "epoch": 10.369539395539132, + "grad_norm": 0.0007680275593884289, + "learning_rate": 0.000233812308226361, + "loss": 0.0227, + "num_input_tokens_seen": 35538336, + "step": 39290 + }, + { + "epoch": 10.370859179094628, + "grad_norm": 7.511608419008553e-05, + "learning_rate": 0.00023053621140300406, + "loss": 0.0177, + "num_input_tokens_seen": 35543104, + "step": 39295 + }, + { + "epoch": 10.372178962650125, + "grad_norm": 0.0004113980976399034, + "learning_rate": 0.00022728321091307623, + "loss": 0.0123, + "num_input_tokens_seen": 35547232, + "step": 39300 + }, + { + "epoch": 10.373498746205623, + "grad_norm": 0.002316125901415944, + "learning_rate": 0.0002240533072582429, + "loss": 0.0189, + "num_input_tokens_seen": 35551840, + "step": 39305 + }, + { + "epoch": 10.37481852976112, + "grad_norm": 0.0003722446272149682, + "learning_rate": 0.00022084650093658897, + "loss": 0.0203, + "num_input_tokens_seen": 35556224, + "step": 39310 + }, + { + "epoch": 10.376138313316616, + "grad_norm": 0.0001485916000092402, + "learning_rate": 0.0002176627924426522, + "loss": 0.0113, + "num_input_tokens_seen": 35560768, + "step": 39315 + }, + { + "epoch": 10.377458096872113, + "grad_norm": 0.0012952717952430248, + "learning_rate": 0.0002145021822673898, + "loss": 0.0173, + "num_input_tokens_seen": 35565344, + "step": 39320 + }, + { + "epoch": 10.37877788042761, + "grad_norm": 0.0004214668879285455, + "learning_rate": 0.00021136467089822862, + "loss": 0.0141, + "num_input_tokens_seen": 35570016, + "step": 39325 + }, + { + "epoch": 10.380097663983106, + "grad_norm": 0.0003649118007160723, + "learning_rate": 0.00020825025881898162, + "loss": 0.0125, + "num_input_tokens_seen": 35574560, + "step": 39330 + }, + { + "epoch": 10.381417447538604, + "grad_norm": 0.00024380229297094047, + "learning_rate": 0.0002051589465099479, + "loss": 0.0103, + "num_input_tokens_seen": 35579264, + "step": 39335 + }, + { + "epoch": 10.3827372310941, + "grad_norm": 0.0007878852775320411, + "learning_rate": 0.0002020907344478462, + "loss": 0.0073, + "num_input_tokens_seen": 35583840, + "step": 39340 + }, + { + "epoch": 10.384057014649597, + "grad_norm": 0.00044183325371704996, + "learning_rate": 0.0001990456231058313, + "loss": 0.0125, + "num_input_tokens_seen": 35588256, + "step": 39345 + }, + { + "epoch": 10.385376798205094, + "grad_norm": 0.00046660625957883894, + "learning_rate": 0.00019602361295349423, + "loss": 0.0065, + "num_input_tokens_seen": 35592928, + "step": 39350 + }, + { + "epoch": 10.38669658176059, + "grad_norm": 0.0005918201059103012, + "learning_rate": 0.0001930247044568789, + "loss": 0.0234, + "num_input_tokens_seen": 35597600, + "step": 39355 + }, + { + "epoch": 10.388016365316089, + "grad_norm": 0.0030397260561585426, + "learning_rate": 0.00019004889807843205, + "loss": 0.012, + "num_input_tokens_seen": 35602080, + "step": 39360 + }, + { + "epoch": 10.389336148871585, + "grad_norm": 0.0014697221340611577, + "learning_rate": 0.00018709619427708656, + "loss": 0.0206, + "num_input_tokens_seen": 35606336, + "step": 39365 + }, + { + "epoch": 10.390655932427082, + "grad_norm": 0.0034342934377491474, + "learning_rate": 0.00018416659350817822, + "loss": 0.017, + "num_input_tokens_seen": 35610656, + "step": 39370 + }, + { + "epoch": 10.391975715982579, + "grad_norm": 0.0002869288146030158, + "learning_rate": 0.00018126009622346229, + "loss": 0.0082, + "num_input_tokens_seen": 35615072, + "step": 39375 + }, + { + "epoch": 10.393295499538075, + "grad_norm": 0.003289605025202036, + "learning_rate": 0.00017837670287119687, + "loss": 0.0186, + "num_input_tokens_seen": 35619456, + "step": 39380 + }, + { + "epoch": 10.394615283093573, + "grad_norm": 0.0008386396802961826, + "learning_rate": 0.00017551641389602633, + "loss": 0.0084, + "num_input_tokens_seen": 35624256, + "step": 39385 + }, + { + "epoch": 10.39593506664907, + "grad_norm": 0.0006883898167870939, + "learning_rate": 0.00017267922973903115, + "loss": 0.0031, + "num_input_tokens_seen": 35628928, + "step": 39390 + }, + { + "epoch": 10.397254850204567, + "grad_norm": 0.0032189530320465565, + "learning_rate": 0.00016986515083774467, + "loss": 0.0147, + "num_input_tokens_seen": 35633248, + "step": 39395 + }, + { + "epoch": 10.398574633760063, + "grad_norm": 0.004700306802988052, + "learning_rate": 0.00016707417762611975, + "loss": 0.0212, + "num_input_tokens_seen": 35637600, + "step": 39400 + }, + { + "epoch": 10.398574633760063, + "eval_loss": 0.11227244138717651, + "eval_runtime": 76.0224, + "eval_samples_per_second": 88.592, + "eval_steps_per_second": 22.151, + "num_input_tokens_seen": 35637600, + "step": 39400 + }, + { + "epoch": 10.39989441731556, + "grad_norm": 0.0017803533701226115, + "learning_rate": 0.00016430631053459543, + "loss": 0.0067, + "num_input_tokens_seen": 35642176, + "step": 39405 + }, + { + "epoch": 10.401214200871058, + "grad_norm": 0.0014338345499709249, + "learning_rate": 0.0001615615499899803, + "loss": 0.0057, + "num_input_tokens_seen": 35646688, + "step": 39410 + }, + { + "epoch": 10.402533984426555, + "grad_norm": 0.001342545379884541, + "learning_rate": 0.00015883989641556905, + "loss": 0.0068, + "num_input_tokens_seen": 35651232, + "step": 39415 + }, + { + "epoch": 10.403853767982051, + "grad_norm": 0.0006197613547556102, + "learning_rate": 0.00015614135023105934, + "loss": 0.012, + "num_input_tokens_seen": 35655808, + "step": 39420 + }, + { + "epoch": 10.405173551537548, + "grad_norm": 0.0001657996472204104, + "learning_rate": 0.00015346591185261827, + "loss": 0.004, + "num_input_tokens_seen": 35660512, + "step": 39425 + }, + { + "epoch": 10.406493335093044, + "grad_norm": 0.00026713203988038003, + "learning_rate": 0.00015081358169281576, + "loss": 0.0109, + "num_input_tokens_seen": 35665344, + "step": 39430 + }, + { + "epoch": 10.40781311864854, + "grad_norm": 0.00029198304400779307, + "learning_rate": 0.00014818436016069135, + "loss": 0.0171, + "num_input_tokens_seen": 35669536, + "step": 39435 + }, + { + "epoch": 10.409132902204039, + "grad_norm": 0.00020783858781214803, + "learning_rate": 0.00014557824766168735, + "loss": 0.0095, + "num_input_tokens_seen": 35674112, + "step": 39440 + }, + { + "epoch": 10.410452685759536, + "grad_norm": 0.005030041094869375, + "learning_rate": 0.00014299524459769896, + "loss": 0.03, + "num_input_tokens_seen": 35678752, + "step": 39445 + }, + { + "epoch": 10.411772469315032, + "grad_norm": 0.0035042972303926945, + "learning_rate": 0.0001404353513670742, + "loss": 0.0252, + "num_input_tokens_seen": 35683072, + "step": 39450 + }, + { + "epoch": 10.413092252870529, + "grad_norm": 0.006309468764811754, + "learning_rate": 0.0001378985683645806, + "loss": 0.0208, + "num_input_tokens_seen": 35687552, + "step": 39455 + }, + { + "epoch": 10.414412036426025, + "grad_norm": 0.0012074130354449153, + "learning_rate": 0.0001353848959813886, + "loss": 0.0221, + "num_input_tokens_seen": 35692160, + "step": 39460 + }, + { + "epoch": 10.415731819981524, + "grad_norm": 0.0003811799979303032, + "learning_rate": 0.00013289433460517142, + "loss": 0.0103, + "num_input_tokens_seen": 35696512, + "step": 39465 + }, + { + "epoch": 10.41705160353702, + "grad_norm": 0.0013172314502298832, + "learning_rate": 0.00013042688462000518, + "loss": 0.0207, + "num_input_tokens_seen": 35701184, + "step": 39470 + }, + { + "epoch": 10.418371387092517, + "grad_norm": 0.0013123748358339071, + "learning_rate": 0.0001279825464063855, + "loss": 0.0178, + "num_input_tokens_seen": 35705664, + "step": 39475 + }, + { + "epoch": 10.419691170648013, + "grad_norm": 0.00034244608832523227, + "learning_rate": 0.00012556132034126087, + "loss": 0.0286, + "num_input_tokens_seen": 35710144, + "step": 39480 + }, + { + "epoch": 10.42101095420351, + "grad_norm": 0.0012596402084454894, + "learning_rate": 0.0001231632067980326, + "loss": 0.0057, + "num_input_tokens_seen": 35714656, + "step": 39485 + }, + { + "epoch": 10.422330737759008, + "grad_norm": 9.133598359767348e-05, + "learning_rate": 0.00012078820614650486, + "loss": 0.0072, + "num_input_tokens_seen": 35719040, + "step": 39490 + }, + { + "epoch": 10.423650521314505, + "grad_norm": 0.0005071734194643795, + "learning_rate": 0.00011843631875291804, + "loss": 0.0218, + "num_input_tokens_seen": 35723520, + "step": 39495 + }, + { + "epoch": 10.424970304870001, + "grad_norm": 0.00176374358125031, + "learning_rate": 0.00011610754497999863, + "loss": 0.0257, + "num_input_tokens_seen": 35728160, + "step": 39500 + }, + { + "epoch": 10.426290088425498, + "grad_norm": 0.005954073742032051, + "learning_rate": 0.0001138018851868594, + "loss": 0.0243, + "num_input_tokens_seen": 35732352, + "step": 39505 + }, + { + "epoch": 10.427609871980994, + "grad_norm": 0.0004850683908443898, + "learning_rate": 0.0001115193397290326, + "loss": 0.0232, + "num_input_tokens_seen": 35737088, + "step": 39510 + }, + { + "epoch": 10.428929655536493, + "grad_norm": 0.0010474026203155518, + "learning_rate": 0.00010925990895856996, + "loss": 0.0385, + "num_input_tokens_seen": 35741248, + "step": 39515 + }, + { + "epoch": 10.43024943909199, + "grad_norm": 6.636718899244443e-05, + "learning_rate": 0.00010702359322385946, + "loss": 0.0048, + "num_input_tokens_seen": 35745600, + "step": 39520 + }, + { + "epoch": 10.431569222647486, + "grad_norm": 0.0006500810850411654, + "learning_rate": 0.00010481039286977523, + "loss": 0.0081, + "num_input_tokens_seen": 35750432, + "step": 39525 + }, + { + "epoch": 10.432889006202982, + "grad_norm": 0.00250276830047369, + "learning_rate": 0.00010262030823764423, + "loss": 0.0178, + "num_input_tokens_seen": 35755072, + "step": 39530 + }, + { + "epoch": 10.434208789758479, + "grad_norm": 0.0008693505660630763, + "learning_rate": 0.00010045333966517966, + "loss": 0.0079, + "num_input_tokens_seen": 35759520, + "step": 39535 + }, + { + "epoch": 10.435528573313977, + "grad_norm": 0.00020022994431201369, + "learning_rate": 9.83094874865642e-05, + "loss": 0.006, + "num_input_tokens_seen": 35764160, + "step": 39540 + }, + { + "epoch": 10.436848356869474, + "grad_norm": 0.004341818857938051, + "learning_rate": 9.618875203241672e-05, + "loss": 0.0262, + "num_input_tokens_seen": 35768768, + "step": 39545 + }, + { + "epoch": 10.43816814042497, + "grad_norm": 8.305795199703425e-05, + "learning_rate": 9.409113362977561e-05, + "loss": 0.0074, + "num_input_tokens_seen": 35773184, + "step": 39550 + }, + { + "epoch": 10.439487923980467, + "grad_norm": 0.0010761701269075274, + "learning_rate": 9.20166326020988e-05, + "loss": 0.0039, + "num_input_tokens_seen": 35777408, + "step": 39555 + }, + { + "epoch": 10.440807707535964, + "grad_norm": 0.0017772153951227665, + "learning_rate": 8.996524926933035e-05, + "loss": 0.0064, + "num_input_tokens_seen": 35781952, + "step": 39560 + }, + { + "epoch": 10.442127491091462, + "grad_norm": 0.0012189416447654366, + "learning_rate": 8.793698394781723e-05, + "loss": 0.0055, + "num_input_tokens_seen": 35786240, + "step": 39565 + }, + { + "epoch": 10.443447274646958, + "grad_norm": 1.7172715160995722e-05, + "learning_rate": 8.593183695030926e-05, + "loss": 0.0032, + "num_input_tokens_seen": 35790496, + "step": 39570 + }, + { + "epoch": 10.444767058202455, + "grad_norm": 0.0020804477389901876, + "learning_rate": 8.39498085860757e-05, + "loss": 0.0121, + "num_input_tokens_seen": 35795072, + "step": 39575 + }, + { + "epoch": 10.446086841757952, + "grad_norm": 0.0001494797324994579, + "learning_rate": 8.199089916072211e-05, + "loss": 0.0091, + "num_input_tokens_seen": 35799648, + "step": 39580 + }, + { + "epoch": 10.447406625313448, + "grad_norm": 0.0024934306275099516, + "learning_rate": 8.005510897637346e-05, + "loss": 0.0162, + "num_input_tokens_seen": 35804224, + "step": 39585 + }, + { + "epoch": 10.448726408868945, + "grad_norm": 0.0008960596169345081, + "learning_rate": 7.8142438331541e-05, + "loss": 0.0215, + "num_input_tokens_seen": 35808864, + "step": 39590 + }, + { + "epoch": 10.450046192424443, + "grad_norm": 0.004689362365752459, + "learning_rate": 7.625288752117209e-05, + "loss": 0.0271, + "num_input_tokens_seen": 35813504, + "step": 39595 + }, + { + "epoch": 10.45136597597994, + "grad_norm": 0.0007709207129664719, + "learning_rate": 7.4386456836667e-05, + "loss": 0.0083, + "num_input_tokens_seen": 35817824, + "step": 39600 + }, + { + "epoch": 10.45136597597994, + "eval_loss": 0.11193311214447021, + "eval_runtime": 75.9913, + "eval_samples_per_second": 88.629, + "eval_steps_per_second": 22.16, + "num_input_tokens_seen": 35817824, + "step": 39600 + }, + { + "epoch": 10.452685759535436, + "grad_norm": 0.002727712271735072, + "learning_rate": 7.254314656586214e-05, + "loss": 0.0231, + "num_input_tokens_seen": 35822336, + "step": 39605 + }, + { + "epoch": 10.454005543090933, + "grad_norm": 0.0008389765862375498, + "learning_rate": 7.07229569929968e-05, + "loss": 0.0093, + "num_input_tokens_seen": 35826560, + "step": 39610 + }, + { + "epoch": 10.45532532664643, + "grad_norm": 0.0010216768132522702, + "learning_rate": 6.892588839879643e-05, + "loss": 0.0123, + "num_input_tokens_seen": 35831104, + "step": 39615 + }, + { + "epoch": 10.456645110201928, + "grad_norm": 0.0023717412259429693, + "learning_rate": 6.71519410603727e-05, + "loss": 0.0094, + "num_input_tokens_seen": 35836032, + "step": 39620 + }, + { + "epoch": 10.457964893757424, + "grad_norm": 0.005895301233977079, + "learning_rate": 6.540111525129011e-05, + "loss": 0.0302, + "num_input_tokens_seen": 35840608, + "step": 39625 + }, + { + "epoch": 10.45928467731292, + "grad_norm": 0.0022102375514805317, + "learning_rate": 6.367341124154934e-05, + "loss": 0.0111, + "num_input_tokens_seen": 35845280, + "step": 39630 + }, + { + "epoch": 10.460604460868417, + "grad_norm": 0.0007001257035881281, + "learning_rate": 6.19688292975873e-05, + "loss": 0.0168, + "num_input_tokens_seen": 35849984, + "step": 39635 + }, + { + "epoch": 10.461924244423914, + "grad_norm": 0.00032891155569814146, + "learning_rate": 6.0287369682260336e-05, + "loss": 0.0105, + "num_input_tokens_seen": 35854464, + "step": 39640 + }, + { + "epoch": 10.463244027979412, + "grad_norm": 0.00012896783300675452, + "learning_rate": 5.8629032654894384e-05, + "loss": 0.0155, + "num_input_tokens_seen": 35858848, + "step": 39645 + }, + { + "epoch": 10.464563811534909, + "grad_norm": 0.0032833744771778584, + "learning_rate": 5.699381847120155e-05, + "loss": 0.008, + "num_input_tokens_seen": 35863488, + "step": 39650 + }, + { + "epoch": 10.465883595090405, + "grad_norm": 0.0006129530956968665, + "learning_rate": 5.5381727383380094e-05, + "loss": 0.0107, + "num_input_tokens_seen": 35868288, + "step": 39655 + }, + { + "epoch": 10.467203378645902, + "grad_norm": 0.0017788332188501954, + "learning_rate": 5.379275964001451e-05, + "loss": 0.0093, + "num_input_tokens_seen": 35872832, + "step": 39660 + }, + { + "epoch": 10.468523162201398, + "grad_norm": 7.497891056118533e-05, + "learning_rate": 5.222691548614211e-05, + "loss": 0.015, + "num_input_tokens_seen": 35877216, + "step": 39665 + }, + { + "epoch": 10.469842945756897, + "grad_norm": 0.0008287960663437843, + "learning_rate": 5.068419516323641e-05, + "loss": 0.0058, + "num_input_tokens_seen": 35881824, + "step": 39670 + }, + { + "epoch": 10.471162729312393, + "grad_norm": 0.0007330512162297964, + "learning_rate": 4.91645989092071e-05, + "loss": 0.0149, + "num_input_tokens_seen": 35886432, + "step": 39675 + }, + { + "epoch": 10.47248251286789, + "grad_norm": 0.002974939998239279, + "learning_rate": 4.7668126958400056e-05, + "loss": 0.0142, + "num_input_tokens_seen": 35890912, + "step": 39680 + }, + { + "epoch": 10.473802296423386, + "grad_norm": 6.180930358823389e-05, + "learning_rate": 4.619477954159734e-05, + "loss": 0.0042, + "num_input_tokens_seen": 35895648, + "step": 39685 + }, + { + "epoch": 10.475122079978883, + "grad_norm": 0.00263988203369081, + "learning_rate": 4.4744556885983884e-05, + "loss": 0.0099, + "num_input_tokens_seen": 35900160, + "step": 39690 + }, + { + "epoch": 10.47644186353438, + "grad_norm": 0.0014002256793901324, + "learning_rate": 4.331745921523078e-05, + "loss": 0.0074, + "num_input_tokens_seen": 35904576, + "step": 39695 + }, + { + "epoch": 10.477761647089878, + "grad_norm": 0.0004275303508620709, + "learning_rate": 4.191348674937867e-05, + "loss": 0.0098, + "num_input_tokens_seen": 35909056, + "step": 39700 + }, + { + "epoch": 10.479081430645374, + "grad_norm": 0.0004611637268681079, + "learning_rate": 4.0532639704971006e-05, + "loss": 0.0055, + "num_input_tokens_seen": 35913312, + "step": 39705 + }, + { + "epoch": 10.480401214200871, + "grad_norm": 0.0001460127969039604, + "learning_rate": 3.917491829493747e-05, + "loss": 0.0268, + "num_input_tokens_seen": 35917632, + "step": 39710 + }, + { + "epoch": 10.481720997756367, + "grad_norm": 0.0001267077896045521, + "learning_rate": 3.78403227286439e-05, + "loss": 0.0059, + "num_input_tokens_seen": 35922432, + "step": 39715 + }, + { + "epoch": 10.483040781311864, + "grad_norm": 0.0009346899460069835, + "learning_rate": 3.652885321192567e-05, + "loss": 0.0046, + "num_input_tokens_seen": 35926752, + "step": 39720 + }, + { + "epoch": 10.484360564867362, + "grad_norm": 0.0008689966052770615, + "learning_rate": 3.524050994702099e-05, + "loss": 0.0091, + "num_input_tokens_seen": 35931328, + "step": 39725 + }, + { + "epoch": 10.485680348422859, + "grad_norm": 0.0010430210968479514, + "learning_rate": 3.3975293132604276e-05, + "loss": 0.0057, + "num_input_tokens_seen": 35936032, + "step": 39730 + }, + { + "epoch": 10.487000131978355, + "grad_norm": 0.00012571245315484703, + "learning_rate": 3.2733202963786125e-05, + "loss": 0.0044, + "num_input_tokens_seen": 35940768, + "step": 39735 + }, + { + "epoch": 10.488319915533852, + "grad_norm": 0.0006517459405586123, + "learning_rate": 3.15142396321133e-05, + "loss": 0.0218, + "num_input_tokens_seen": 35945536, + "step": 39740 + }, + { + "epoch": 10.489639699089349, + "grad_norm": 0.001330663333646953, + "learning_rate": 3.0318403325552132e-05, + "loss": 0.0353, + "num_input_tokens_seen": 35950016, + "step": 39745 + }, + { + "epoch": 10.490959482644847, + "grad_norm": 0.003864794736728072, + "learning_rate": 2.914569422855506e-05, + "loss": 0.0099, + "num_input_tokens_seen": 35954304, + "step": 39750 + }, + { + "epoch": 10.492279266200343, + "grad_norm": 0.00012589708785526454, + "learning_rate": 2.7996112521927462e-05, + "loss": 0.012, + "num_input_tokens_seen": 35958720, + "step": 39755 + }, + { + "epoch": 10.49359904975584, + "grad_norm": 0.00026846505352295935, + "learning_rate": 2.68696583829775e-05, + "loss": 0.0255, + "num_input_tokens_seen": 35963328, + "step": 39760 + }, + { + "epoch": 10.494918833311337, + "grad_norm": 0.001407917938195169, + "learning_rate": 2.576633198539957e-05, + "loss": 0.0069, + "num_input_tokens_seen": 35967744, + "step": 39765 + }, + { + "epoch": 10.496238616866833, + "grad_norm": 0.0029415267053991556, + "learning_rate": 2.46861334993409e-05, + "loss": 0.0159, + "num_input_tokens_seen": 35972320, + "step": 39770 + }, + { + "epoch": 10.497558400422331, + "grad_norm": 0.0008293110295198858, + "learning_rate": 2.3629063091384903e-05, + "loss": 0.0079, + "num_input_tokens_seen": 35976960, + "step": 39775 + }, + { + "epoch": 10.498878183977828, + "grad_norm": 0.003402974223718047, + "learning_rate": 2.2595120924567834e-05, + "loss": 0.0189, + "num_input_tokens_seen": 35981472, + "step": 39780 + }, + { + "epoch": 10.500197967533325, + "grad_norm": 0.0005261172773316503, + "learning_rate": 2.158430715829551e-05, + "loss": 0.0209, + "num_input_tokens_seen": 35985984, + "step": 39785 + }, + { + "epoch": 10.501517751088821, + "grad_norm": 0.0020777343306690454, + "learning_rate": 2.059662194849321e-05, + "loss": 0.0065, + "num_input_tokens_seen": 35990592, + "step": 39790 + }, + { + "epoch": 10.502837534644318, + "grad_norm": 0.00023580659762956202, + "learning_rate": 1.9632065447422463e-05, + "loss": 0.0137, + "num_input_tokens_seen": 35994880, + "step": 39795 + }, + { + "epoch": 10.504157318199816, + "grad_norm": 0.0009463938768021762, + "learning_rate": 1.8690637803880916e-05, + "loss": 0.0159, + "num_input_tokens_seen": 35999840, + "step": 39800 + }, + { + "epoch": 10.504157318199816, + "eval_loss": 0.1114056259393692, + "eval_runtime": 75.879, + "eval_samples_per_second": 88.76, + "eval_steps_per_second": 22.193, + "num_input_tokens_seen": 35999840, + "step": 39800 + }, + { + "epoch": 10.505477101755313, + "grad_norm": 0.001933524152263999, + "learning_rate": 1.7772339163019123e-05, + "loss": 0.0065, + "num_input_tokens_seen": 36004288, + "step": 39805 + }, + { + "epoch": 10.50679688531081, + "grad_norm": 0.0016536012990400195, + "learning_rate": 1.6877169666457138e-05, + "loss": 0.0121, + "num_input_tokens_seen": 36008672, + "step": 39810 + }, + { + "epoch": 10.508116668866306, + "grad_norm": 0.00025447524967603385, + "learning_rate": 1.6005129452234532e-05, + "loss": 0.0146, + "num_input_tokens_seen": 36013312, + "step": 39815 + }, + { + "epoch": 10.509436452421802, + "grad_norm": 0.0006643570377491415, + "learning_rate": 1.5156218654843733e-05, + "loss": 0.0092, + "num_input_tokens_seen": 36017824, + "step": 39820 + }, + { + "epoch": 10.5107562359773, + "grad_norm": 0.0006930578383617103, + "learning_rate": 1.4330437405196683e-05, + "loss": 0.0114, + "num_input_tokens_seen": 36022496, + "step": 39825 + }, + { + "epoch": 10.512076019532797, + "grad_norm": 0.0021062646992504597, + "learning_rate": 1.352778583062486e-05, + "loss": 0.0107, + "num_input_tokens_seen": 36027168, + "step": 39830 + }, + { + "epoch": 10.513395803088294, + "grad_norm": 0.0011566628236323595, + "learning_rate": 1.2748264054929237e-05, + "loss": 0.0082, + "num_input_tokens_seen": 36031648, + "step": 39835 + }, + { + "epoch": 10.51471558664379, + "grad_norm": 0.001137306448072195, + "learning_rate": 1.1991872198297004e-05, + "loss": 0.0109, + "num_input_tokens_seen": 36035936, + "step": 39840 + }, + { + "epoch": 10.516035370199287, + "grad_norm": 5.563245213124901e-05, + "learning_rate": 1.1258610377384847e-05, + "loss": 0.0032, + "num_input_tokens_seen": 36040480, + "step": 39845 + }, + { + "epoch": 10.517355153754785, + "grad_norm": 0.0011757775209844112, + "learning_rate": 1.0548478705268982e-05, + "loss": 0.012, + "num_input_tokens_seen": 36044896, + "step": 39850 + }, + { + "epoch": 10.518674937310282, + "grad_norm": 0.0008029053569771349, + "learning_rate": 9.86147729147846e-06, + "loss": 0.0069, + "num_input_tokens_seen": 36049344, + "step": 39855 + }, + { + "epoch": 10.519994720865778, + "grad_norm": 0.0038669693749397993, + "learning_rate": 9.197606241928557e-06, + "loss": 0.011, + "num_input_tokens_seen": 36053792, + "step": 39860 + }, + { + "epoch": 10.521314504421275, + "grad_norm": 0.0006893714307807386, + "learning_rate": 8.556865659004042e-06, + "loss": 0.0057, + "num_input_tokens_seen": 36058272, + "step": 39865 + }, + { + "epoch": 10.522634287976771, + "grad_norm": 0.0014481511898338795, + "learning_rate": 7.939255641525867e-06, + "loss": 0.0203, + "num_input_tokens_seen": 36062880, + "step": 39870 + }, + { + "epoch": 10.523954071532268, + "grad_norm": 0.0017264453927055001, + "learning_rate": 7.344776284751164e-06, + "loss": 0.0168, + "num_input_tokens_seen": 36067488, + "step": 39875 + }, + { + "epoch": 10.525273855087766, + "grad_norm": 0.0007202457636594772, + "learning_rate": 6.773427680323296e-06, + "loss": 0.0043, + "num_input_tokens_seen": 36072064, + "step": 39880 + }, + { + "epoch": 10.526593638643263, + "grad_norm": 0.004411143250763416, + "learning_rate": 6.225209916355112e-06, + "loss": 0.0181, + "num_input_tokens_seen": 36076480, + "step": 39885 + }, + { + "epoch": 10.52791342219876, + "grad_norm": 0.0009953728877007961, + "learning_rate": 5.7001230774123e-06, + "loss": 0.0154, + "num_input_tokens_seen": 36081184, + "step": 39890 + }, + { + "epoch": 10.529233205754256, + "grad_norm": 0.0004567843279801309, + "learning_rate": 5.198167244446772e-06, + "loss": 0.0188, + "num_input_tokens_seen": 36085696, + "step": 39895 + }, + { + "epoch": 10.530552989309752, + "grad_norm": 0.0006796038942411542, + "learning_rate": 4.71934249487993e-06, + "loss": 0.0049, + "num_input_tokens_seen": 36090016, + "step": 39900 + }, + { + "epoch": 10.53187277286525, + "grad_norm": 0.0009217948536388576, + "learning_rate": 4.2636489025527075e-06, + "loss": 0.0275, + "num_input_tokens_seen": 36094368, + "step": 39905 + }, + { + "epoch": 10.533192556420747, + "grad_norm": 0.00020937433873768896, + "learning_rate": 3.831086537742223e-06, + "loss": 0.0087, + "num_input_tokens_seen": 36099008, + "step": 39910 + }, + { + "epoch": 10.534512339976244, + "grad_norm": 0.0033993434626609087, + "learning_rate": 3.4216554671451236e-06, + "loss": 0.0079, + "num_input_tokens_seen": 36103584, + "step": 39915 + }, + { + "epoch": 10.53583212353174, + "grad_norm": 0.002229637000709772, + "learning_rate": 3.035355753894242e-06, + "loss": 0.0123, + "num_input_tokens_seen": 36108000, + "step": 39920 + }, + { + "epoch": 10.537151907087237, + "grad_norm": 0.0006450340151786804, + "learning_rate": 2.6721874575752477e-06, + "loss": 0.0062, + "num_input_tokens_seen": 36112768, + "step": 39925 + }, + { + "epoch": 10.538471690642735, + "grad_norm": 0.0010562088573351502, + "learning_rate": 2.3321506341933418e-06, + "loss": 0.0059, + "num_input_tokens_seen": 36117440, + "step": 39930 + }, + { + "epoch": 10.539791474198232, + "grad_norm": 0.001168692484498024, + "learning_rate": 2.0152453361732546e-06, + "loss": 0.0107, + "num_input_tokens_seen": 36121856, + "step": 39935 + }, + { + "epoch": 10.541111257753728, + "grad_norm": 0.0003735740319825709, + "learning_rate": 1.7214716123925554e-06, + "loss": 0.0182, + "num_input_tokens_seen": 36126304, + "step": 39940 + }, + { + "epoch": 10.542431041309225, + "grad_norm": 0.0005260084290057421, + "learning_rate": 1.4508295081649968e-06, + "loss": 0.0232, + "num_input_tokens_seen": 36130688, + "step": 39945 + }, + { + "epoch": 10.543750824864722, + "grad_norm": 0.001909770886413753, + "learning_rate": 1.2033190652238623e-06, + "loss": 0.0077, + "num_input_tokens_seen": 36135424, + "step": 39950 + }, + { + "epoch": 10.545070608420218, + "grad_norm": 0.00348149542696774, + "learning_rate": 9.78940321721966e-07, + "loss": 0.0106, + "num_input_tokens_seen": 36139776, + "step": 39955 + }, + { + "epoch": 10.546390391975716, + "grad_norm": 0.000723052944522351, + "learning_rate": 7.776933122816132e-07, + "loss": 0.0085, + "num_input_tokens_seen": 36144864, + "step": 39960 + }, + { + "epoch": 10.547710175531213, + "grad_norm": 0.0002241699257865548, + "learning_rate": 5.99578067927986e-07, + "loss": 0.0098, + "num_input_tokens_seen": 36149536, + "step": 39965 + }, + { + "epoch": 10.54902995908671, + "grad_norm": 0.004462388344109058, + "learning_rate": 4.445946161224512e-07, + "loss": 0.0077, + "num_input_tokens_seen": 36154304, + "step": 39970 + }, + { + "epoch": 10.550349742642206, + "grad_norm": 0.0021078966092318296, + "learning_rate": 3.127429807792126e-07, + "loss": 0.0163, + "num_input_tokens_seen": 36158656, + "step": 39975 + }, + { + "epoch": 10.551669526197703, + "grad_norm": 0.0004764001932926476, + "learning_rate": 2.040231822320049e-07, + "loss": 0.0049, + "num_input_tokens_seen": 36163136, + "step": 39980 + }, + { + "epoch": 10.552989309753201, + "grad_norm": 0.001351567916572094, + "learning_rate": 1.1843523723409354e-07, + "loss": 0.0072, + "num_input_tokens_seen": 36167456, + "step": 39985 + }, + { + "epoch": 10.554309093308698, + "grad_norm": 0.000241125890170224, + "learning_rate": 5.597915897492811e-08, + "loss": 0.0072, + "num_input_tokens_seen": 36171968, + "step": 39990 + }, + { + "epoch": 10.555628876864194, + "grad_norm": 0.0006730271270498633, + "learning_rate": 1.6654957113448885e-08, + "loss": 0.0103, + "num_input_tokens_seen": 36176544, + "step": 39995 + }, + { + "epoch": 10.55694866041969, + "grad_norm": 0.0017167185433208942, + "learning_rate": 4.626377114735902e-10, + "loss": 0.0205, + "num_input_tokens_seen": 36181120, + "step": 40000 + }, + { + "epoch": 10.55694866041969, + "eval_loss": 0.11209341138601303, + "eval_runtime": 75.9616, + "eval_samples_per_second": 88.663, + "eval_steps_per_second": 22.169, + "num_input_tokens_seen": 36181120, + "step": 40000 + }, + { + "epoch": 10.55694866041969, + "num_input_tokens_seen": 36181120, + "step": 40000, + "total_flos": 1.5150396743467008e+17, + "train_loss": 0.07485237948787399, + "train_runtime": 31328.1948, + "train_samples_per_second": 20.429, + "train_steps_per_second": 1.277 + } + ], + "logging_steps": 5, + "max_steps": 40000, + "num_input_tokens_seen": 36181120, + "num_train_epochs": 11, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.5150396743467008e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}