{ "best_global_step": 50000, "best_metric": 0.5881, "best_model_checkpoint": "models/NED/SPACCC_full_upsampled_tfidf/Meta-Llama-3-8B-Instruct/checkpoint-50000", "epoch": 3.0, "eval_steps": 2000, "global_step": 103965, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.771104082763195, "epoch": 0.057711729909104025, "grad_norm": 8.875, "learning_rate": 1.9227316447579353e-05, "loss": 0.807, "mean_token_accuracy": 0.8383643639683723, "num_tokens": 15534479.0, "step": 2000 }, { "epoch": 0.057711729909104025, "eval_entropy": 1.7064778925563795, "eval_loss": 0.5752137899398804, "eval_mean_token_accuracy": 0.8736604764762166, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 15534479.0, "eval_recall": 0.4857, "eval_runtime": 19.5541, "eval_samples_per_second": 46.435, "eval_steps_per_second": 46.435, "step": 2000 }, { "entropy": 1.7855026668310165, "epoch": 0.11542345981820805, "grad_norm": 4.9375, "learning_rate": 2.973821470360748e-05, "loss": 0.4284, "mean_token_accuracy": 0.9000137696564198, "num_tokens": 31091376.0, "step": 4000 }, { "epoch": 0.11542345981820805, "eval_entropy": 1.9107416086391205, "eval_loss": 0.6141767501831055, "eval_mean_token_accuracy": 0.8813558152921924, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 31091376.0, "eval_recall": 0.5132, "eval_runtime": 18.6743, "eval_samples_per_second": 48.623, "eval_steps_per_second": 48.623, "step": 4000 }, { "entropy": 1.7454647228717803, "epoch": 0.17313518972731207, "grad_norm": 6.40625, "learning_rate": 2.914324812089721e-05, "loss": 0.3478, "mean_token_accuracy": 0.9183464118242264, "num_tokens": 46695519.0, "step": 6000 }, { "epoch": 0.17313518972731207, "eval_entropy": 1.6889207995279245, "eval_loss": 0.7172139286994934, "eval_mean_token_accuracy": 0.8881947658627831, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 46695519.0, "eval_recall": 0.5551, "eval_runtime": 18.7505, "eval_samples_per_second": 48.425, "eval_steps_per_second": 48.425, "step": 6000 }, { "entropy": 1.686599359869957, "epoch": 0.2308469196364161, "grad_norm": 5.0, "learning_rate": 2.8548281538186937e-05, "loss": 0.2911, "mean_token_accuracy": 0.9297748121023178, "num_tokens": 62201311.0, "step": 8000 }, { "epoch": 0.2308469196364161, "eval_entropy": 1.765830583837589, "eval_loss": 0.6870580315589905, "eval_mean_token_accuracy": 0.8954090231184392, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 62201311.0, "eval_recall": 0.554, "eval_runtime": 18.2183, "eval_samples_per_second": 49.84, "eval_steps_per_second": 49.84, "step": 8000 }, { "entropy": 1.6513705806136132, "epoch": 0.28855864954552013, "grad_norm": 2.890625, "learning_rate": 2.795331495547667e-05, "loss": 0.2723, "mean_token_accuracy": 0.9330688781142235, "num_tokens": 77723725.0, "step": 10000 }, { "epoch": 0.28855864954552013, "eval_entropy": 1.7299244680199855, "eval_loss": 0.671196460723877, "eval_mean_token_accuracy": 0.8996821519003828, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 77723725.0, "eval_recall": 0.5837, "eval_runtime": 18.1693, "eval_samples_per_second": 49.975, "eval_steps_per_second": 49.975, "step": 10000 }, { "entropy": 1.6137347612977029, "epoch": 0.34627037945462413, "grad_norm": 4.15625, "learning_rate": 2.7358348372766396e-05, "loss": 0.247, "mean_token_accuracy": 0.938195524007082, "num_tokens": 93268948.0, "step": 12000 }, { "epoch": 0.34627037945462413, "eval_entropy": 1.642808350631844, "eval_loss": 0.7069945335388184, "eval_mean_token_accuracy": 0.9005183333998735, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 93268948.0, "eval_recall": 0.5683, "eval_runtime": 18.2098, "eval_samples_per_second": 49.863, "eval_steps_per_second": 49.863, "step": 12000 }, { "entropy": 1.5417808018922805, "epoch": 0.4039821093637282, "grad_norm": 6.15625, "learning_rate": 2.6763381790056127e-05, "loss": 0.2369, "mean_token_accuracy": 0.9400165711343288, "num_tokens": 108712209.0, "step": 14000 }, { "epoch": 0.4039821093637282, "eval_entropy": 1.6070995822483223, "eval_loss": 0.7336843013763428, "eval_mean_token_accuracy": 0.9035924484026064, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 108712209.0, "eval_recall": 0.5749, "eval_runtime": 18.1147, "eval_samples_per_second": 50.125, "eval_steps_per_second": 50.125, "step": 14000 }, { "entropy": 1.5307720832824707, "epoch": 0.4616938392728322, "grad_norm": 1.3203125, "learning_rate": 2.6168415207345855e-05, "loss": 0.224, "mean_token_accuracy": 0.9427571404874325, "num_tokens": 124237300.0, "step": 16000 }, { "epoch": 0.4616938392728322, "eval_entropy": 1.6007142412242386, "eval_loss": 0.723686933517456, "eval_mean_token_accuracy": 0.9036177859891878, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 124237300.0, "eval_recall": 0.5738, "eval_runtime": 18.0095, "eval_samples_per_second": 50.418, "eval_steps_per_second": 50.418, "step": 16000 }, { "entropy": 1.5197161840200424, "epoch": 0.5194055691819363, "grad_norm": 4.375, "learning_rate": 2.5573448624635583e-05, "loss": 0.2127, "mean_token_accuracy": 0.9449084457457065, "num_tokens": 139804083.0, "step": 18000 }, { "epoch": 0.5194055691819363, "eval_entropy": 1.5377285603909765, "eval_loss": 0.7464824318885803, "eval_mean_token_accuracy": 0.9009800682968505, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 139804083.0, "eval_recall": 0.5694, "eval_runtime": 17.3142, "eval_samples_per_second": 52.443, "eval_steps_per_second": 52.443, "step": 18000 }, { "entropy": 1.4599213127493857, "epoch": 0.5771172990910403, "grad_norm": 4.96875, "learning_rate": 2.4978482041925314e-05, "loss": 0.2026, "mean_token_accuracy": 0.9471077627837657, "num_tokens": 155379953.0, "step": 20000 }, { "epoch": 0.5771172990910403, "eval_entropy": 1.475688523001608, "eval_loss": 0.762593150138855, "eval_mean_token_accuracy": 0.9033181490089399, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 155379953.0, "eval_recall": 0.5705, "eval_runtime": 17.8659, "eval_samples_per_second": 50.823, "eval_steps_per_second": 50.823, "step": 20000 }, { "entropy": 1.4277473657727242, "epoch": 0.6348290290001443, "grad_norm": 5.15625, "learning_rate": 2.4383515459215042e-05, "loss": 0.1907, "mean_token_accuracy": 0.9498722539842128, "num_tokens": 170982670.0, "step": 22000 }, { "epoch": 0.6348290290001443, "eval_entropy": 1.5171283333837198, "eval_loss": 0.7434535026550293, "eval_mean_token_accuracy": 0.9021543205965983, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 170982670.0, "eval_recall": 0.5727, "eval_runtime": 17.4411, "eval_samples_per_second": 52.061, "eval_steps_per_second": 52.061, "step": 22000 }, { "entropy": 1.4187169399261474, "epoch": 0.6925407589092483, "grad_norm": 2.9375, "learning_rate": 2.378854887650477e-05, "loss": 0.1905, "mean_token_accuracy": 0.9493927232325077, "num_tokens": 186663193.0, "step": 24000 }, { "epoch": 0.6925407589092483, "eval_entropy": 1.4800235901503835, "eval_loss": 0.7626135945320129, "eval_mean_token_accuracy": 0.9030716799298047, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 186663193.0, "eval_recall": 0.5771, "eval_runtime": 17.6742, "eval_samples_per_second": 51.374, "eval_steps_per_second": 51.374, "step": 24000 }, { "entropy": 1.3858990859389304, "epoch": 0.7502524888183524, "grad_norm": 5.4375, "learning_rate": 2.31935822937945e-05, "loss": 0.1847, "mean_token_accuracy": 0.9506744608581066, "num_tokens": 202382574.0, "step": 26000 }, { "epoch": 0.7502524888183524, "eval_entropy": 1.4523340987476483, "eval_loss": 0.7447758316993713, "eval_mean_token_accuracy": 0.9053517018777159, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 202382574.0, "eval_recall": 0.5837, "eval_runtime": 17.5503, "eval_samples_per_second": 51.737, "eval_steps_per_second": 51.737, "step": 26000 }, { "entropy": 1.371607663989067, "epoch": 0.8079642187274564, "grad_norm": 6.46875, "learning_rate": 2.259861571108423e-05, "loss": 0.1747, "mean_token_accuracy": 0.9534294557571411, "num_tokens": 217894933.0, "step": 28000 }, { "epoch": 0.8079642187274564, "eval_entropy": 1.449135780728336, "eval_loss": 0.726492166519165, "eval_mean_token_accuracy": 0.9063065528607054, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 217894933.0, "eval_recall": 0.5782, "eval_runtime": 17.4046, "eval_samples_per_second": 52.17, "eval_steps_per_second": 52.17, "step": 28000 }, { "entropy": 1.3474767149090767, "epoch": 0.8656759486365604, "grad_norm": 1.15625, "learning_rate": 2.2003649128373957e-05, "loss": 0.1662, "mean_token_accuracy": 0.9552805411219597, "num_tokens": 233442927.0, "step": 30000 }, { "epoch": 0.8656759486365604, "eval_entropy": 1.4218564205495272, "eval_loss": 0.7416213750839233, "eval_mean_token_accuracy": 0.9069001922129535, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 233442927.0, "eval_recall": 0.587, "eval_runtime": 17.6827, "eval_samples_per_second": 51.35, "eval_steps_per_second": 51.35, "step": 30000 }, { "entropy": 1.337467650592327, "epoch": 0.9233876785456644, "grad_norm": 4.375, "learning_rate": 2.1408682545663684e-05, "loss": 0.1637, "mean_token_accuracy": 0.9557446602284908, "num_tokens": 249071659.0, "step": 32000 }, { "epoch": 0.9233876785456644, "eval_entropy": 1.3889587524835234, "eval_loss": 0.7390624284744263, "eval_mean_token_accuracy": 0.9073679444876537, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 249071659.0, "eval_recall": 0.5848, "eval_runtime": 17.5032, "eval_samples_per_second": 51.876, "eval_steps_per_second": 51.876, "step": 32000 }, { "entropy": 1.3001156712770463, "epoch": 0.9810994084547684, "grad_norm": 3.890625, "learning_rate": 2.0813715962953412e-05, "loss": 0.1557, "mean_token_accuracy": 0.9573536138236522, "num_tokens": 264672169.0, "step": 34000 }, { "epoch": 0.9810994084547684, "eval_entropy": 1.4234498099214705, "eval_loss": 0.7484801411628723, "eval_mean_token_accuracy": 0.9075630426012997, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 264672169.0, "eval_recall": 0.5859, "eval_runtime": 16.2686, "eval_samples_per_second": 55.813, "eval_steps_per_second": 55.813, "step": 34000 }, { "entropy": 1.2362971892952919, "epoch": 1.0388111383638725, "grad_norm": 4.96875, "learning_rate": 2.0218749380243143e-05, "loss": 0.1179, "mean_token_accuracy": 0.9659893708825111, "num_tokens": 280288735.0, "step": 36000 }, { "epoch": 1.0388111383638725, "eval_entropy": 1.268248316540592, "eval_loss": 0.7719414830207825, "eval_mean_token_accuracy": 0.9055645169796923, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 280288735.0, "eval_recall": 0.5771, "eval_runtime": 16.2819, "eval_samples_per_second": 55.767, "eval_steps_per_second": 55.767, "step": 36000 }, { "entropy": 1.1861163977086544, "epoch": 1.0965228682729764, "grad_norm": 2.703125, "learning_rate": 1.962378279753287e-05, "loss": 0.1001, "mean_token_accuracy": 0.9700573923885822, "num_tokens": 295889252.0, "step": 38000 }, { "epoch": 1.0965228682729764, "eval_entropy": 1.2859099207339308, "eval_loss": 0.7740228176116943, "eval_mean_token_accuracy": 0.9053704237425905, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 295889252.0, "eval_recall": 0.5815, "eval_runtime": 17.2227, "eval_samples_per_second": 52.721, "eval_steps_per_second": 52.721, "step": 38000 }, { "entropy": 1.172551353752613, "epoch": 1.1542345981820805, "grad_norm": 5.09375, "learning_rate": 1.90288162148226e-05, "loss": 0.0999, "mean_token_accuracy": 0.9701334120929241, "num_tokens": 311460812.0, "step": 40000 }, { "epoch": 1.1542345981820805, "eval_entropy": 1.2553664291613953, "eval_loss": 0.7721803784370422, "eval_mean_token_accuracy": 0.9035136323537071, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 311460812.0, "eval_recall": 0.576, "eval_runtime": 16.7852, "eval_samples_per_second": 54.095, "eval_steps_per_second": 54.095, "step": 40000 }, { "entropy": 1.1619984501898288, "epoch": 1.2119463280911846, "grad_norm": 2.21875, "learning_rate": 1.843384963211233e-05, "loss": 0.0965, "mean_token_accuracy": 0.9710877353549003, "num_tokens": 327135410.0, "step": 42000 }, { "epoch": 1.2119463280911846, "eval_entropy": 1.2579897131426219, "eval_loss": 0.7692885398864746, "eval_mean_token_accuracy": 0.9031982754033042, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 327135410.0, "eval_recall": 0.576, "eval_runtime": 17.2218, "eval_samples_per_second": 52.724, "eval_steps_per_second": 52.724, "step": 42000 }, { "entropy": 1.1411213338077069, "epoch": 1.2696580580002885, "grad_norm": 3.578125, "learning_rate": 1.7838883049402058e-05, "loss": 0.0944, "mean_token_accuracy": 0.9715765230953693, "num_tokens": 342774177.0, "step": 44000 }, { "epoch": 1.2696580580002885, "eval_entropy": 1.253242742319464, "eval_loss": 0.7519774436950684, "eval_mean_token_accuracy": 0.9045354708175827, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 342774177.0, "eval_recall": 0.576, "eval_runtime": 17.1793, "eval_samples_per_second": 52.854, "eval_steps_per_second": 52.854, "step": 44000 }, { "entropy": 1.1595853001475334, "epoch": 1.3273697879093926, "grad_norm": 5.5, "learning_rate": 1.724391646669179e-05, "loss": 0.0963, "mean_token_accuracy": 0.9711391851603984, "num_tokens": 358312922.0, "step": 46000 }, { "epoch": 1.3273697879093926, "eval_entropy": 1.2453804171689282, "eval_loss": 0.7676454186439514, "eval_mean_token_accuracy": 0.9064169454876547, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 358312922.0, "eval_recall": 0.5837, "eval_runtime": 16.8037, "eval_samples_per_second": 54.036, "eval_steps_per_second": 54.036, "step": 46000 }, { "entropy": 1.1609133576154709, "epoch": 1.3850815178184965, "grad_norm": 4.03125, "learning_rate": 1.6648949883981517e-05, "loss": 0.0922, "mean_token_accuracy": 0.9723608312606812, "num_tokens": 373752333.0, "step": 48000 }, { "epoch": 1.3850815178184965, "eval_entropy": 1.2345776492934921, "eval_loss": 0.7665285468101501, "eval_mean_token_accuracy": 0.9063460667687365, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 373752333.0, "eval_recall": 0.5793, "eval_runtime": 16.7578, "eval_samples_per_second": 54.184, "eval_steps_per_second": 54.184, "step": 48000 }, { "entropy": 1.1655547478497028, "epoch": 1.4427932477276006, "grad_norm": 5.59375, "learning_rate": 1.6053983301271245e-05, "loss": 0.094, "mean_token_accuracy": 0.9717481000125409, "num_tokens": 389447345.0, "step": 50000 }, { "epoch": 1.4427932477276006, "eval_entropy": 1.2292915042407713, "eval_loss": 0.7735024094581604, "eval_mean_token_accuracy": 0.907910385517822, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 389447345.0, "eval_recall": 0.5881, "eval_runtime": 17.3697, "eval_samples_per_second": 52.275, "eval_steps_per_second": 52.275, "step": 50000 }, { "entropy": 1.1552352701127528, "epoch": 1.5005049776367048, "grad_norm": 3.0, "learning_rate": 1.5459016718560976e-05, "loss": 0.091, "mean_token_accuracy": 0.9726284679472447, "num_tokens": 404935652.0, "step": 52000 }, { "epoch": 1.5005049776367048, "eval_entropy": 1.2490241264325406, "eval_loss": 0.7779573202133179, "eval_mean_token_accuracy": 0.9046718338053132, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 404935652.0, "eval_recall": 0.5848, "eval_runtime": 17.2173, "eval_samples_per_second": 52.738, "eval_steps_per_second": 52.738, "step": 52000 }, { "entropy": 1.1580015743076801, "epoch": 1.5582167075458087, "grad_norm": 0.0019989013671875, "learning_rate": 1.4864050135850704e-05, "loss": 0.0674, "mean_token_accuracy": 0.979576114565134, "num_tokens": 15533221.0, "step": 54000 }, { "epoch": 1.5582167075458087, "eval_entropy": 1.204221866138706, "eval_loss": 0.8085830211639404, "eval_mean_token_accuracy": 0.9048162211668124, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 15533221.0, "eval_recall": 0.5738, "eval_runtime": 17.1435, "eval_samples_per_second": 52.965, "eval_steps_per_second": 52.965, "step": 54000 }, { "entropy": 1.144241349697113, "epoch": 1.6159284374549125, "grad_norm": 3.765625, "learning_rate": 1.4269083553140432e-05, "loss": 0.0633, "mean_token_accuracy": 0.9807874869704246, "num_tokens": 31150685.0, "step": 56000 }, { "epoch": 1.6159284374549125, "eval_entropy": 1.2577752770306256, "eval_loss": 0.8108322024345398, "eval_mean_token_accuracy": 0.904205797002179, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 31150685.0, "eval_recall": 0.5738, "eval_runtime": 16.4274, "eval_samples_per_second": 55.274, "eval_steps_per_second": 55.274, "step": 56000 }, { "entropy": 1.162702257514, "epoch": 1.6736401673640167, "grad_norm": 2.8125, "learning_rate": 1.3674116970430161e-05, "loss": 0.0665, "mean_token_accuracy": 0.9797295650243759, "num_tokens": 46832332.0, "step": 58000 }, { "epoch": 1.6736401673640167, "eval_entropy": 1.268515376989537, "eval_loss": 0.814584493637085, "eval_mean_token_accuracy": 0.904229478295154, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 46832332.0, "eval_recall": 0.5716, "eval_runtime": 16.4853, "eval_samples_per_second": 55.079, "eval_steps_per_second": 55.079, "step": 58000 }, { "entropy": 1.1661596206724645, "epoch": 1.7313518972731208, "grad_norm": 5.96875, "learning_rate": 1.3079150387719889e-05, "loss": 0.0672, "mean_token_accuracy": 0.9796462517380714, "num_tokens": 62558817.0, "step": 60000 }, { "epoch": 1.7313518972731208, "eval_entropy": 1.2622852631996382, "eval_loss": 0.8227198123931885, "eval_mean_token_accuracy": 0.9038923141846048, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 62558817.0, "eval_recall": 0.5716, "eval_runtime": 16.9948, "eval_samples_per_second": 53.428, "eval_steps_per_second": 53.428, "step": 60000 }, { "entropy": 1.1797457176148891, "epoch": 1.789063627182225, "grad_norm": 0.5546875, "learning_rate": 1.2484183805009618e-05, "loss": 0.0657, "mean_token_accuracy": 0.980204150468111, "num_tokens": 78074806.0, "step": 62000 }, { "epoch": 1.789063627182225, "eval_entropy": 1.2418163208052975, "eval_loss": 0.8185028433799744, "eval_mean_token_accuracy": 0.9041991046740621, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 78074806.0, "eval_recall": 0.5705, "eval_runtime": 17.1144, "eval_samples_per_second": 53.055, "eval_steps_per_second": 53.055, "step": 62000 }, { "entropy": 1.1774089051187038, "epoch": 1.8467753570913288, "grad_norm": 5.5625, "learning_rate": 1.1889217222299348e-05, "loss": 0.0649, "mean_token_accuracy": 0.9804997465908527, "num_tokens": 93602629.0, "step": 64000 }, { "epoch": 1.8467753570913288, "eval_entropy": 1.2988805646807087, "eval_loss": 0.8260899782180786, "eval_mean_token_accuracy": 0.9030656689523601, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 93602629.0, "eval_recall": 0.576, "eval_runtime": 16.1643, "eval_samples_per_second": 56.173, "eval_steps_per_second": 56.173, "step": 64000 }, { "entropy": 1.174987347126007, "epoch": 1.9044870870004327, "grad_norm": 5.53125, "learning_rate": 1.1294250639589077e-05, "loss": 0.064, "mean_token_accuracy": 0.9806980607807636, "num_tokens": 109249414.0, "step": 66000 }, { "epoch": 1.9044870870004327, "eval_entropy": 1.2433809736489199, "eval_loss": 0.8272661566734314, "eval_mean_token_accuracy": 0.9028221254569319, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 109249414.0, "eval_recall": 0.5727, "eval_runtime": 16.3988, "eval_samples_per_second": 55.37, "eval_steps_per_second": 55.37, "step": 66000 }, { "entropy": 1.1633582679629326, "epoch": 1.9621988169095368, "grad_norm": 5.34375, "learning_rate": 1.0699284056878807e-05, "loss": 0.0643, "mean_token_accuracy": 0.9805754337012768, "num_tokens": 124880720.0, "step": 68000 }, { "epoch": 1.9621988169095368, "eval_entropy": 1.224490842367584, "eval_loss": 0.8288715481758118, "eval_mean_token_accuracy": 0.9034351931991557, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 124880720.0, "eval_recall": 0.5738, "eval_runtime": 16.4997, "eval_samples_per_second": 55.031, "eval_steps_per_second": 55.031, "step": 68000 }, { "entropy": 1.1513627296090125, "epoch": 2.019910546818641, "grad_norm": 0.79296875, "learning_rate": 1.0104317474168535e-05, "loss": 0.0633, "mean_token_accuracy": 0.9811660476624966, "num_tokens": 140499220.0, "step": 70000 }, { "epoch": 2.019910546818641, "eval_entropy": 1.2267822175561593, "eval_loss": 0.8458257913589478, "eval_mean_token_accuracy": 0.9038964834572986, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 140499220.0, "eval_recall": 0.5705, "eval_runtime": 16.4967, "eval_samples_per_second": 55.041, "eval_steps_per_second": 55.041, "step": 70000 }, { "entropy": 1.143776093840599, "epoch": 2.077622276727745, "grad_norm": 7.9375, "learning_rate": 9.509350891458264e-06, "loss": 0.0597, "mean_token_accuracy": 0.9825106913745403, "num_tokens": 156048918.0, "step": 72000 }, { "epoch": 2.077622276727745, "eval_entropy": 1.2148328015195116, "eval_loss": 0.8337165713310242, "eval_mean_token_accuracy": 0.9035390550475814, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 156048918.0, "eval_recall": 0.5705, "eval_runtime": 16.506, "eval_samples_per_second": 55.01, "eval_steps_per_second": 55.01, "step": 72000 }, { "entropy": 1.1460822140574456, "epoch": 2.135334006636849, "grad_norm": 12.375, "learning_rate": 8.914384308747992e-06, "loss": 0.0596, "mean_token_accuracy": 0.98244061678648, "num_tokens": 171653895.0, "step": 74000 }, { "epoch": 2.135334006636849, "eval_entropy": 1.2635613490175046, "eval_loss": 0.8348618745803833, "eval_mean_token_accuracy": 0.9038379774285308, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 171653895.0, "eval_recall": 0.5694, "eval_runtime": 16.4822, "eval_samples_per_second": 55.09, "eval_steps_per_second": 55.09, "step": 74000 }, { "entropy": 1.1560133908391, "epoch": 2.193045736545953, "grad_norm": 7.625, "learning_rate": 8.319417726037721e-06, "loss": 0.06, "mean_token_accuracy": 0.9822552761137485, "num_tokens": 187228261.0, "step": 76000 }, { "epoch": 2.193045736545953, "eval_entropy": 1.2220293277554575, "eval_loss": 0.8315507769584656, "eval_mean_token_accuracy": 0.9036543207809263, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 187228261.0, "eval_recall": 0.5716, "eval_runtime": 16.703, "eval_samples_per_second": 54.361, "eval_steps_per_second": 54.361, "step": 76000 }, { "entropy": 1.1676500248610973, "epoch": 2.250757466455057, "grad_norm": 4.84375, "learning_rate": 7.72445114332745e-06, "loss": 0.0611, "mean_token_accuracy": 0.9819406977891922, "num_tokens": 202699683.0, "step": 78000 }, { "epoch": 2.250757466455057, "eval_entropy": 1.2448954319638827, "eval_loss": 0.8309385776519775, "eval_mean_token_accuracy": 0.9030922418255113, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 202699683.0, "eval_recall": 0.5694, "eval_runtime": 16.603, "eval_samples_per_second": 54.689, "eval_steps_per_second": 54.689, "step": 78000 }, { "entropy": 1.1656713368594647, "epoch": 2.308469196364161, "grad_norm": 6.53125, "learning_rate": 7.129484560617179e-06, "loss": 0.0618, "mean_token_accuracy": 0.9817487963140011, "num_tokens": 218284466.0, "step": 80000 }, { "epoch": 2.308469196364161, "eval_entropy": 1.255102663181952, "eval_loss": 0.8435425162315369, "eval_mean_token_accuracy": 0.902260869642974, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 218284466.0, "eval_recall": 0.5661, "eval_runtime": 16.8256, "eval_samples_per_second": 53.965, "eval_steps_per_second": 53.965, "step": 80000 }, { "entropy": 1.1597592905461789, "epoch": 2.366180926273265, "grad_norm": 2.5, "learning_rate": 6.534517977906908e-06, "loss": 0.0602, "mean_token_accuracy": 0.9821576415896416, "num_tokens": 233928452.0, "step": 82000 }, { "epoch": 2.366180926273265, "eval_entropy": 1.2422783964924875, "eval_loss": 0.8390738368034363, "eval_mean_token_accuracy": 0.9032785006950605, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 233928452.0, "eval_recall": 0.5683, "eval_runtime": 16.7577, "eval_samples_per_second": 54.184, "eval_steps_per_second": 54.184, "step": 82000 }, { "entropy": 1.17008468157053, "epoch": 2.4238926561823693, "grad_norm": 0.0400390625, "learning_rate": 5.939551395196637e-06, "loss": 0.0591, "mean_token_accuracy": 0.9825585896968841, "num_tokens": 249419664.0, "step": 84000 }, { "epoch": 2.4238926561823693, "eval_entropy": 1.2469606770424067, "eval_loss": 0.8383654356002808, "eval_mean_token_accuracy": 0.9040639832418921, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 249419664.0, "eval_recall": 0.5705, "eval_runtime": 16.6392, "eval_samples_per_second": 54.57, "eval_steps_per_second": 54.57, "step": 84000 }, { "entropy": 1.163529093414545, "epoch": 2.481604386091473, "grad_norm": 6.28125, "learning_rate": 5.3445848124863655e-06, "loss": 0.0568, "mean_token_accuracy": 0.9832313210368157, "num_tokens": 264982654.0, "step": 86000 }, { "epoch": 2.481604386091473, "eval_entropy": 1.236849331908289, "eval_loss": 0.8381890058517456, "eval_mean_token_accuracy": 0.9027883698630438, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 264982654.0, "eval_recall": 0.5672, "eval_runtime": 16.5964, "eval_samples_per_second": 54.711, "eval_steps_per_second": 54.711, "step": 86000 }, { "entropy": 1.1701532056927682, "epoch": 2.539316116000577, "grad_norm": 0.87109375, "learning_rate": 4.749618229776094e-06, "loss": 0.0574, "mean_token_accuracy": 0.9830155865848065, "num_tokens": 280520807.0, "step": 88000 }, { "epoch": 2.539316116000577, "eval_entropy": 1.2492524392673097, "eval_loss": 0.839518666267395, "eval_mean_token_accuracy": 0.9025986767681685, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 280520807.0, "eval_recall": 0.5661, "eval_runtime": 16.5519, "eval_samples_per_second": 54.858, "eval_steps_per_second": 54.858, "step": 88000 }, { "entropy": 1.167941878914833, "epoch": 2.597027845909681, "grad_norm": 0.451171875, "learning_rate": 4.154651647065824e-06, "loss": 0.0602, "mean_token_accuracy": 0.9820802296400071, "num_tokens": 296146535.0, "step": 90000 }, { "epoch": 2.597027845909681, "eval_entropy": 1.2443812186234848, "eval_loss": 0.8395401835441589, "eval_mean_token_accuracy": 0.9034286766981764, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 296146535.0, "eval_recall": 0.5683, "eval_runtime": 16.788, "eval_samples_per_second": 54.086, "eval_steps_per_second": 54.086, "step": 90000 }, { "entropy": 1.1601335457861424, "epoch": 2.6547395758187853, "grad_norm": 4.6875, "learning_rate": 3.559685064355552e-06, "loss": 0.0584, "mean_token_accuracy": 0.9827592859268188, "num_tokens": 311778551.0, "step": 92000 }, { "epoch": 2.6547395758187853, "eval_entropy": 1.2437387075324415, "eval_loss": 0.836577296257019, "eval_mean_token_accuracy": 0.9039776291419231, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 311778551.0, "eval_recall": 0.5705, "eval_runtime": 16.2272, "eval_samples_per_second": 55.956, "eval_steps_per_second": 55.956, "step": 92000 }, { "entropy": 1.1733056641221047, "epoch": 2.712451305727889, "grad_norm": 5.03125, "learning_rate": 2.964718481645281e-06, "loss": 0.0564, "mean_token_accuracy": 0.9832823853492737, "num_tokens": 327170479.0, "step": 94000 }, { "epoch": 2.712451305727889, "eval_entropy": 1.2440849004337966, "eval_loss": 0.8399211168289185, "eval_mean_token_accuracy": 0.9033104040155326, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 327170479.0, "eval_recall": 0.5683, "eval_runtime": 16.2233, "eval_samples_per_second": 55.969, "eval_steps_per_second": 55.969, "step": 94000 }, { "entropy": 1.1586334483027458, "epoch": 2.770163035636993, "grad_norm": 3.953125, "learning_rate": 2.36975189893501e-06, "loss": 0.0585, "mean_token_accuracy": 0.9826480825543403, "num_tokens": 342791353.0, "step": 96000 }, { "epoch": 2.770163035636993, "eval_entropy": 1.2412338042180444, "eval_loss": 0.8378188610076904, "eval_mean_token_accuracy": 0.9035194405572005, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 342791353.0, "eval_recall": 0.5672, "eval_runtime": 16.2077, "eval_samples_per_second": 56.023, "eval_steps_per_second": 56.023, "step": 96000 }, { "entropy": 1.1629991734027862, "epoch": 2.827874765546097, "grad_norm": 6.4375, "learning_rate": 1.7747853162247388e-06, "loss": 0.0608, "mean_token_accuracy": 0.9821404512822628, "num_tokens": 358436354.0, "step": 98000 }, { "epoch": 2.827874765546097, "eval_entropy": 1.2435034370369848, "eval_loss": 0.8380420207977295, "eval_mean_token_accuracy": 0.9037704004327631, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 358436354.0, "eval_recall": 0.5694, "eval_runtime": 16.2198, "eval_samples_per_second": 55.981, "eval_steps_per_second": 55.981, "step": 98000 }, { "entropy": 1.1640874392092229, "epoch": 2.8855864954552013, "grad_norm": 0.8203125, "learning_rate": 1.1798187335144677e-06, "loss": 0.0574, "mean_token_accuracy": 0.9829988768994808, "num_tokens": 374029027.0, "step": 100000 }, { "epoch": 2.8855864954552013, "eval_entropy": 1.2438825091207606, "eval_loss": 0.8370459079742432, "eval_mean_token_accuracy": 0.9030486140810445, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 374029027.0, "eval_recall": 0.5661, "eval_runtime": 17.3712, "eval_samples_per_second": 52.271, "eval_steps_per_second": 52.271, "step": 100000 }, { "entropy": 1.1683570961356162, "epoch": 2.9432982253643054, "grad_norm": 6.65625, "learning_rate": 5.848521508041964e-07, "loss": 0.0583, "mean_token_accuracy": 0.982835016399622, "num_tokens": 389554889.0, "step": 102000 }, { "epoch": 2.9432982253643054, "eval_entropy": 1.2444104566710636, "eval_loss": 0.8376456499099731, "eval_mean_token_accuracy": 0.9037148623608282, "eval_num_gold": 908, "eval_num_guess": 908, "eval_num_tokens": 389554889.0, "eval_recall": 0.5683, "eval_runtime": 16.2408, "eval_samples_per_second": 55.909, "eval_steps_per_second": 55.909, "step": 102000 } ], "logging_steps": 2000, "max_steps": 103965, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6466469785747587e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }