SynCABEL_SPACCC / trainer_state.json
AnonymousARR42's picture
Upload trainer_state.json with huggingface_hub
22699df verified
{
"best_global_step": 50000,
"best_metric": 0.5881,
"best_model_checkpoint": "models/NED/SPACCC_full_upsampled_tfidf/Meta-Llama-3-8B-Instruct/checkpoint-50000",
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 103965,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.771104082763195,
"epoch": 0.057711729909104025,
"grad_norm": 8.875,
"learning_rate": 1.9227316447579353e-05,
"loss": 0.807,
"mean_token_accuracy": 0.8383643639683723,
"num_tokens": 15534479.0,
"step": 2000
},
{
"epoch": 0.057711729909104025,
"eval_entropy": 1.7064778925563795,
"eval_loss": 0.5752137899398804,
"eval_mean_token_accuracy": 0.8736604764762166,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 15534479.0,
"eval_recall": 0.4857,
"eval_runtime": 19.5541,
"eval_samples_per_second": 46.435,
"eval_steps_per_second": 46.435,
"step": 2000
},
{
"entropy": 1.7855026668310165,
"epoch": 0.11542345981820805,
"grad_norm": 4.9375,
"learning_rate": 2.973821470360748e-05,
"loss": 0.4284,
"mean_token_accuracy": 0.9000137696564198,
"num_tokens": 31091376.0,
"step": 4000
},
{
"epoch": 0.11542345981820805,
"eval_entropy": 1.9107416086391205,
"eval_loss": 0.6141767501831055,
"eval_mean_token_accuracy": 0.8813558152921924,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 31091376.0,
"eval_recall": 0.5132,
"eval_runtime": 18.6743,
"eval_samples_per_second": 48.623,
"eval_steps_per_second": 48.623,
"step": 4000
},
{
"entropy": 1.7454647228717803,
"epoch": 0.17313518972731207,
"grad_norm": 6.40625,
"learning_rate": 2.914324812089721e-05,
"loss": 0.3478,
"mean_token_accuracy": 0.9183464118242264,
"num_tokens": 46695519.0,
"step": 6000
},
{
"epoch": 0.17313518972731207,
"eval_entropy": 1.6889207995279245,
"eval_loss": 0.7172139286994934,
"eval_mean_token_accuracy": 0.8881947658627831,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 46695519.0,
"eval_recall": 0.5551,
"eval_runtime": 18.7505,
"eval_samples_per_second": 48.425,
"eval_steps_per_second": 48.425,
"step": 6000
},
{
"entropy": 1.686599359869957,
"epoch": 0.2308469196364161,
"grad_norm": 5.0,
"learning_rate": 2.8548281538186937e-05,
"loss": 0.2911,
"mean_token_accuracy": 0.9297748121023178,
"num_tokens": 62201311.0,
"step": 8000
},
{
"epoch": 0.2308469196364161,
"eval_entropy": 1.765830583837589,
"eval_loss": 0.6870580315589905,
"eval_mean_token_accuracy": 0.8954090231184392,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 62201311.0,
"eval_recall": 0.554,
"eval_runtime": 18.2183,
"eval_samples_per_second": 49.84,
"eval_steps_per_second": 49.84,
"step": 8000
},
{
"entropy": 1.6513705806136132,
"epoch": 0.28855864954552013,
"grad_norm": 2.890625,
"learning_rate": 2.795331495547667e-05,
"loss": 0.2723,
"mean_token_accuracy": 0.9330688781142235,
"num_tokens": 77723725.0,
"step": 10000
},
{
"epoch": 0.28855864954552013,
"eval_entropy": 1.7299244680199855,
"eval_loss": 0.671196460723877,
"eval_mean_token_accuracy": 0.8996821519003828,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 77723725.0,
"eval_recall": 0.5837,
"eval_runtime": 18.1693,
"eval_samples_per_second": 49.975,
"eval_steps_per_second": 49.975,
"step": 10000
},
{
"entropy": 1.6137347612977029,
"epoch": 0.34627037945462413,
"grad_norm": 4.15625,
"learning_rate": 2.7358348372766396e-05,
"loss": 0.247,
"mean_token_accuracy": 0.938195524007082,
"num_tokens": 93268948.0,
"step": 12000
},
{
"epoch": 0.34627037945462413,
"eval_entropy": 1.642808350631844,
"eval_loss": 0.7069945335388184,
"eval_mean_token_accuracy": 0.9005183333998735,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 93268948.0,
"eval_recall": 0.5683,
"eval_runtime": 18.2098,
"eval_samples_per_second": 49.863,
"eval_steps_per_second": 49.863,
"step": 12000
},
{
"entropy": 1.5417808018922805,
"epoch": 0.4039821093637282,
"grad_norm": 6.15625,
"learning_rate": 2.6763381790056127e-05,
"loss": 0.2369,
"mean_token_accuracy": 0.9400165711343288,
"num_tokens": 108712209.0,
"step": 14000
},
{
"epoch": 0.4039821093637282,
"eval_entropy": 1.6070995822483223,
"eval_loss": 0.7336843013763428,
"eval_mean_token_accuracy": 0.9035924484026064,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 108712209.0,
"eval_recall": 0.5749,
"eval_runtime": 18.1147,
"eval_samples_per_second": 50.125,
"eval_steps_per_second": 50.125,
"step": 14000
},
{
"entropy": 1.5307720832824707,
"epoch": 0.4616938392728322,
"grad_norm": 1.3203125,
"learning_rate": 2.6168415207345855e-05,
"loss": 0.224,
"mean_token_accuracy": 0.9427571404874325,
"num_tokens": 124237300.0,
"step": 16000
},
{
"epoch": 0.4616938392728322,
"eval_entropy": 1.6007142412242386,
"eval_loss": 0.723686933517456,
"eval_mean_token_accuracy": 0.9036177859891878,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 124237300.0,
"eval_recall": 0.5738,
"eval_runtime": 18.0095,
"eval_samples_per_second": 50.418,
"eval_steps_per_second": 50.418,
"step": 16000
},
{
"entropy": 1.5197161840200424,
"epoch": 0.5194055691819363,
"grad_norm": 4.375,
"learning_rate": 2.5573448624635583e-05,
"loss": 0.2127,
"mean_token_accuracy": 0.9449084457457065,
"num_tokens": 139804083.0,
"step": 18000
},
{
"epoch": 0.5194055691819363,
"eval_entropy": 1.5377285603909765,
"eval_loss": 0.7464824318885803,
"eval_mean_token_accuracy": 0.9009800682968505,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 139804083.0,
"eval_recall": 0.5694,
"eval_runtime": 17.3142,
"eval_samples_per_second": 52.443,
"eval_steps_per_second": 52.443,
"step": 18000
},
{
"entropy": 1.4599213127493857,
"epoch": 0.5771172990910403,
"grad_norm": 4.96875,
"learning_rate": 2.4978482041925314e-05,
"loss": 0.2026,
"mean_token_accuracy": 0.9471077627837657,
"num_tokens": 155379953.0,
"step": 20000
},
{
"epoch": 0.5771172990910403,
"eval_entropy": 1.475688523001608,
"eval_loss": 0.762593150138855,
"eval_mean_token_accuracy": 0.9033181490089399,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 155379953.0,
"eval_recall": 0.5705,
"eval_runtime": 17.8659,
"eval_samples_per_second": 50.823,
"eval_steps_per_second": 50.823,
"step": 20000
},
{
"entropy": 1.4277473657727242,
"epoch": 0.6348290290001443,
"grad_norm": 5.15625,
"learning_rate": 2.4383515459215042e-05,
"loss": 0.1907,
"mean_token_accuracy": 0.9498722539842128,
"num_tokens": 170982670.0,
"step": 22000
},
{
"epoch": 0.6348290290001443,
"eval_entropy": 1.5171283333837198,
"eval_loss": 0.7434535026550293,
"eval_mean_token_accuracy": 0.9021543205965983,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 170982670.0,
"eval_recall": 0.5727,
"eval_runtime": 17.4411,
"eval_samples_per_second": 52.061,
"eval_steps_per_second": 52.061,
"step": 22000
},
{
"entropy": 1.4187169399261474,
"epoch": 0.6925407589092483,
"grad_norm": 2.9375,
"learning_rate": 2.378854887650477e-05,
"loss": 0.1905,
"mean_token_accuracy": 0.9493927232325077,
"num_tokens": 186663193.0,
"step": 24000
},
{
"epoch": 0.6925407589092483,
"eval_entropy": 1.4800235901503835,
"eval_loss": 0.7626135945320129,
"eval_mean_token_accuracy": 0.9030716799298047,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 186663193.0,
"eval_recall": 0.5771,
"eval_runtime": 17.6742,
"eval_samples_per_second": 51.374,
"eval_steps_per_second": 51.374,
"step": 24000
},
{
"entropy": 1.3858990859389304,
"epoch": 0.7502524888183524,
"grad_norm": 5.4375,
"learning_rate": 2.31935822937945e-05,
"loss": 0.1847,
"mean_token_accuracy": 0.9506744608581066,
"num_tokens": 202382574.0,
"step": 26000
},
{
"epoch": 0.7502524888183524,
"eval_entropy": 1.4523340987476483,
"eval_loss": 0.7447758316993713,
"eval_mean_token_accuracy": 0.9053517018777159,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 202382574.0,
"eval_recall": 0.5837,
"eval_runtime": 17.5503,
"eval_samples_per_second": 51.737,
"eval_steps_per_second": 51.737,
"step": 26000
},
{
"entropy": 1.371607663989067,
"epoch": 0.8079642187274564,
"grad_norm": 6.46875,
"learning_rate": 2.259861571108423e-05,
"loss": 0.1747,
"mean_token_accuracy": 0.9534294557571411,
"num_tokens": 217894933.0,
"step": 28000
},
{
"epoch": 0.8079642187274564,
"eval_entropy": 1.449135780728336,
"eval_loss": 0.726492166519165,
"eval_mean_token_accuracy": 0.9063065528607054,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 217894933.0,
"eval_recall": 0.5782,
"eval_runtime": 17.4046,
"eval_samples_per_second": 52.17,
"eval_steps_per_second": 52.17,
"step": 28000
},
{
"entropy": 1.3474767149090767,
"epoch": 0.8656759486365604,
"grad_norm": 1.15625,
"learning_rate": 2.2003649128373957e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9552805411219597,
"num_tokens": 233442927.0,
"step": 30000
},
{
"epoch": 0.8656759486365604,
"eval_entropy": 1.4218564205495272,
"eval_loss": 0.7416213750839233,
"eval_mean_token_accuracy": 0.9069001922129535,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 233442927.0,
"eval_recall": 0.587,
"eval_runtime": 17.6827,
"eval_samples_per_second": 51.35,
"eval_steps_per_second": 51.35,
"step": 30000
},
{
"entropy": 1.337467650592327,
"epoch": 0.9233876785456644,
"grad_norm": 4.375,
"learning_rate": 2.1408682545663684e-05,
"loss": 0.1637,
"mean_token_accuracy": 0.9557446602284908,
"num_tokens": 249071659.0,
"step": 32000
},
{
"epoch": 0.9233876785456644,
"eval_entropy": 1.3889587524835234,
"eval_loss": 0.7390624284744263,
"eval_mean_token_accuracy": 0.9073679444876537,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 249071659.0,
"eval_recall": 0.5848,
"eval_runtime": 17.5032,
"eval_samples_per_second": 51.876,
"eval_steps_per_second": 51.876,
"step": 32000
},
{
"entropy": 1.3001156712770463,
"epoch": 0.9810994084547684,
"grad_norm": 3.890625,
"learning_rate": 2.0813715962953412e-05,
"loss": 0.1557,
"mean_token_accuracy": 0.9573536138236522,
"num_tokens": 264672169.0,
"step": 34000
},
{
"epoch": 0.9810994084547684,
"eval_entropy": 1.4234498099214705,
"eval_loss": 0.7484801411628723,
"eval_mean_token_accuracy": 0.9075630426012997,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 264672169.0,
"eval_recall": 0.5859,
"eval_runtime": 16.2686,
"eval_samples_per_second": 55.813,
"eval_steps_per_second": 55.813,
"step": 34000
},
{
"entropy": 1.2362971892952919,
"epoch": 1.0388111383638725,
"grad_norm": 4.96875,
"learning_rate": 2.0218749380243143e-05,
"loss": 0.1179,
"mean_token_accuracy": 0.9659893708825111,
"num_tokens": 280288735.0,
"step": 36000
},
{
"epoch": 1.0388111383638725,
"eval_entropy": 1.268248316540592,
"eval_loss": 0.7719414830207825,
"eval_mean_token_accuracy": 0.9055645169796923,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 280288735.0,
"eval_recall": 0.5771,
"eval_runtime": 16.2819,
"eval_samples_per_second": 55.767,
"eval_steps_per_second": 55.767,
"step": 36000
},
{
"entropy": 1.1861163977086544,
"epoch": 1.0965228682729764,
"grad_norm": 2.703125,
"learning_rate": 1.962378279753287e-05,
"loss": 0.1001,
"mean_token_accuracy": 0.9700573923885822,
"num_tokens": 295889252.0,
"step": 38000
},
{
"epoch": 1.0965228682729764,
"eval_entropy": 1.2859099207339308,
"eval_loss": 0.7740228176116943,
"eval_mean_token_accuracy": 0.9053704237425905,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 295889252.0,
"eval_recall": 0.5815,
"eval_runtime": 17.2227,
"eval_samples_per_second": 52.721,
"eval_steps_per_second": 52.721,
"step": 38000
},
{
"entropy": 1.172551353752613,
"epoch": 1.1542345981820805,
"grad_norm": 5.09375,
"learning_rate": 1.90288162148226e-05,
"loss": 0.0999,
"mean_token_accuracy": 0.9701334120929241,
"num_tokens": 311460812.0,
"step": 40000
},
{
"epoch": 1.1542345981820805,
"eval_entropy": 1.2553664291613953,
"eval_loss": 0.7721803784370422,
"eval_mean_token_accuracy": 0.9035136323537071,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 311460812.0,
"eval_recall": 0.576,
"eval_runtime": 16.7852,
"eval_samples_per_second": 54.095,
"eval_steps_per_second": 54.095,
"step": 40000
},
{
"entropy": 1.1619984501898288,
"epoch": 1.2119463280911846,
"grad_norm": 2.21875,
"learning_rate": 1.843384963211233e-05,
"loss": 0.0965,
"mean_token_accuracy": 0.9710877353549003,
"num_tokens": 327135410.0,
"step": 42000
},
{
"epoch": 1.2119463280911846,
"eval_entropy": 1.2579897131426219,
"eval_loss": 0.7692885398864746,
"eval_mean_token_accuracy": 0.9031982754033042,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 327135410.0,
"eval_recall": 0.576,
"eval_runtime": 17.2218,
"eval_samples_per_second": 52.724,
"eval_steps_per_second": 52.724,
"step": 42000
},
{
"entropy": 1.1411213338077069,
"epoch": 1.2696580580002885,
"grad_norm": 3.578125,
"learning_rate": 1.7838883049402058e-05,
"loss": 0.0944,
"mean_token_accuracy": 0.9715765230953693,
"num_tokens": 342774177.0,
"step": 44000
},
{
"epoch": 1.2696580580002885,
"eval_entropy": 1.253242742319464,
"eval_loss": 0.7519774436950684,
"eval_mean_token_accuracy": 0.9045354708175827,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 342774177.0,
"eval_recall": 0.576,
"eval_runtime": 17.1793,
"eval_samples_per_second": 52.854,
"eval_steps_per_second": 52.854,
"step": 44000
},
{
"entropy": 1.1595853001475334,
"epoch": 1.3273697879093926,
"grad_norm": 5.5,
"learning_rate": 1.724391646669179e-05,
"loss": 0.0963,
"mean_token_accuracy": 0.9711391851603984,
"num_tokens": 358312922.0,
"step": 46000
},
{
"epoch": 1.3273697879093926,
"eval_entropy": 1.2453804171689282,
"eval_loss": 0.7676454186439514,
"eval_mean_token_accuracy": 0.9064169454876547,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 358312922.0,
"eval_recall": 0.5837,
"eval_runtime": 16.8037,
"eval_samples_per_second": 54.036,
"eval_steps_per_second": 54.036,
"step": 46000
},
{
"entropy": 1.1609133576154709,
"epoch": 1.3850815178184965,
"grad_norm": 4.03125,
"learning_rate": 1.6648949883981517e-05,
"loss": 0.0922,
"mean_token_accuracy": 0.9723608312606812,
"num_tokens": 373752333.0,
"step": 48000
},
{
"epoch": 1.3850815178184965,
"eval_entropy": 1.2345776492934921,
"eval_loss": 0.7665285468101501,
"eval_mean_token_accuracy": 0.9063460667687365,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 373752333.0,
"eval_recall": 0.5793,
"eval_runtime": 16.7578,
"eval_samples_per_second": 54.184,
"eval_steps_per_second": 54.184,
"step": 48000
},
{
"entropy": 1.1655547478497028,
"epoch": 1.4427932477276006,
"grad_norm": 5.59375,
"learning_rate": 1.6053983301271245e-05,
"loss": 0.094,
"mean_token_accuracy": 0.9717481000125409,
"num_tokens": 389447345.0,
"step": 50000
},
{
"epoch": 1.4427932477276006,
"eval_entropy": 1.2292915042407713,
"eval_loss": 0.7735024094581604,
"eval_mean_token_accuracy": 0.907910385517822,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 389447345.0,
"eval_recall": 0.5881,
"eval_runtime": 17.3697,
"eval_samples_per_second": 52.275,
"eval_steps_per_second": 52.275,
"step": 50000
},
{
"entropy": 1.1552352701127528,
"epoch": 1.5005049776367048,
"grad_norm": 3.0,
"learning_rate": 1.5459016718560976e-05,
"loss": 0.091,
"mean_token_accuracy": 0.9726284679472447,
"num_tokens": 404935652.0,
"step": 52000
},
{
"epoch": 1.5005049776367048,
"eval_entropy": 1.2490241264325406,
"eval_loss": 0.7779573202133179,
"eval_mean_token_accuracy": 0.9046718338053132,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 404935652.0,
"eval_recall": 0.5848,
"eval_runtime": 17.2173,
"eval_samples_per_second": 52.738,
"eval_steps_per_second": 52.738,
"step": 52000
},
{
"entropy": 1.1580015743076801,
"epoch": 1.5582167075458087,
"grad_norm": 0.0019989013671875,
"learning_rate": 1.4864050135850704e-05,
"loss": 0.0674,
"mean_token_accuracy": 0.979576114565134,
"num_tokens": 15533221.0,
"step": 54000
},
{
"epoch": 1.5582167075458087,
"eval_entropy": 1.204221866138706,
"eval_loss": 0.8085830211639404,
"eval_mean_token_accuracy": 0.9048162211668124,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 15533221.0,
"eval_recall": 0.5738,
"eval_runtime": 17.1435,
"eval_samples_per_second": 52.965,
"eval_steps_per_second": 52.965,
"step": 54000
},
{
"entropy": 1.144241349697113,
"epoch": 1.6159284374549125,
"grad_norm": 3.765625,
"learning_rate": 1.4269083553140432e-05,
"loss": 0.0633,
"mean_token_accuracy": 0.9807874869704246,
"num_tokens": 31150685.0,
"step": 56000
},
{
"epoch": 1.6159284374549125,
"eval_entropy": 1.2577752770306256,
"eval_loss": 0.8108322024345398,
"eval_mean_token_accuracy": 0.904205797002179,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 31150685.0,
"eval_recall": 0.5738,
"eval_runtime": 16.4274,
"eval_samples_per_second": 55.274,
"eval_steps_per_second": 55.274,
"step": 56000
},
{
"entropy": 1.162702257514,
"epoch": 1.6736401673640167,
"grad_norm": 2.8125,
"learning_rate": 1.3674116970430161e-05,
"loss": 0.0665,
"mean_token_accuracy": 0.9797295650243759,
"num_tokens": 46832332.0,
"step": 58000
},
{
"epoch": 1.6736401673640167,
"eval_entropy": 1.268515376989537,
"eval_loss": 0.814584493637085,
"eval_mean_token_accuracy": 0.904229478295154,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 46832332.0,
"eval_recall": 0.5716,
"eval_runtime": 16.4853,
"eval_samples_per_second": 55.079,
"eval_steps_per_second": 55.079,
"step": 58000
},
{
"entropy": 1.1661596206724645,
"epoch": 1.7313518972731208,
"grad_norm": 5.96875,
"learning_rate": 1.3079150387719889e-05,
"loss": 0.0672,
"mean_token_accuracy": 0.9796462517380714,
"num_tokens": 62558817.0,
"step": 60000
},
{
"epoch": 1.7313518972731208,
"eval_entropy": 1.2622852631996382,
"eval_loss": 0.8227198123931885,
"eval_mean_token_accuracy": 0.9038923141846048,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 62558817.0,
"eval_recall": 0.5716,
"eval_runtime": 16.9948,
"eval_samples_per_second": 53.428,
"eval_steps_per_second": 53.428,
"step": 60000
},
{
"entropy": 1.1797457176148891,
"epoch": 1.789063627182225,
"grad_norm": 0.5546875,
"learning_rate": 1.2484183805009618e-05,
"loss": 0.0657,
"mean_token_accuracy": 0.980204150468111,
"num_tokens": 78074806.0,
"step": 62000
},
{
"epoch": 1.789063627182225,
"eval_entropy": 1.2418163208052975,
"eval_loss": 0.8185028433799744,
"eval_mean_token_accuracy": 0.9041991046740621,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 78074806.0,
"eval_recall": 0.5705,
"eval_runtime": 17.1144,
"eval_samples_per_second": 53.055,
"eval_steps_per_second": 53.055,
"step": 62000
},
{
"entropy": 1.1774089051187038,
"epoch": 1.8467753570913288,
"grad_norm": 5.5625,
"learning_rate": 1.1889217222299348e-05,
"loss": 0.0649,
"mean_token_accuracy": 0.9804997465908527,
"num_tokens": 93602629.0,
"step": 64000
},
{
"epoch": 1.8467753570913288,
"eval_entropy": 1.2988805646807087,
"eval_loss": 0.8260899782180786,
"eval_mean_token_accuracy": 0.9030656689523601,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 93602629.0,
"eval_recall": 0.576,
"eval_runtime": 16.1643,
"eval_samples_per_second": 56.173,
"eval_steps_per_second": 56.173,
"step": 64000
},
{
"entropy": 1.174987347126007,
"epoch": 1.9044870870004327,
"grad_norm": 5.53125,
"learning_rate": 1.1294250639589077e-05,
"loss": 0.064,
"mean_token_accuracy": 0.9806980607807636,
"num_tokens": 109249414.0,
"step": 66000
},
{
"epoch": 1.9044870870004327,
"eval_entropy": 1.2433809736489199,
"eval_loss": 0.8272661566734314,
"eval_mean_token_accuracy": 0.9028221254569319,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 109249414.0,
"eval_recall": 0.5727,
"eval_runtime": 16.3988,
"eval_samples_per_second": 55.37,
"eval_steps_per_second": 55.37,
"step": 66000
},
{
"entropy": 1.1633582679629326,
"epoch": 1.9621988169095368,
"grad_norm": 5.34375,
"learning_rate": 1.0699284056878807e-05,
"loss": 0.0643,
"mean_token_accuracy": 0.9805754337012768,
"num_tokens": 124880720.0,
"step": 68000
},
{
"epoch": 1.9621988169095368,
"eval_entropy": 1.224490842367584,
"eval_loss": 0.8288715481758118,
"eval_mean_token_accuracy": 0.9034351931991557,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 124880720.0,
"eval_recall": 0.5738,
"eval_runtime": 16.4997,
"eval_samples_per_second": 55.031,
"eval_steps_per_second": 55.031,
"step": 68000
},
{
"entropy": 1.1513627296090125,
"epoch": 2.019910546818641,
"grad_norm": 0.79296875,
"learning_rate": 1.0104317474168535e-05,
"loss": 0.0633,
"mean_token_accuracy": 0.9811660476624966,
"num_tokens": 140499220.0,
"step": 70000
},
{
"epoch": 2.019910546818641,
"eval_entropy": 1.2267822175561593,
"eval_loss": 0.8458257913589478,
"eval_mean_token_accuracy": 0.9038964834572986,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 140499220.0,
"eval_recall": 0.5705,
"eval_runtime": 16.4967,
"eval_samples_per_second": 55.041,
"eval_steps_per_second": 55.041,
"step": 70000
},
{
"entropy": 1.143776093840599,
"epoch": 2.077622276727745,
"grad_norm": 7.9375,
"learning_rate": 9.509350891458264e-06,
"loss": 0.0597,
"mean_token_accuracy": 0.9825106913745403,
"num_tokens": 156048918.0,
"step": 72000
},
{
"epoch": 2.077622276727745,
"eval_entropy": 1.2148328015195116,
"eval_loss": 0.8337165713310242,
"eval_mean_token_accuracy": 0.9035390550475814,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 156048918.0,
"eval_recall": 0.5705,
"eval_runtime": 16.506,
"eval_samples_per_second": 55.01,
"eval_steps_per_second": 55.01,
"step": 72000
},
{
"entropy": 1.1460822140574456,
"epoch": 2.135334006636849,
"grad_norm": 12.375,
"learning_rate": 8.914384308747992e-06,
"loss": 0.0596,
"mean_token_accuracy": 0.98244061678648,
"num_tokens": 171653895.0,
"step": 74000
},
{
"epoch": 2.135334006636849,
"eval_entropy": 1.2635613490175046,
"eval_loss": 0.8348618745803833,
"eval_mean_token_accuracy": 0.9038379774285308,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 171653895.0,
"eval_recall": 0.5694,
"eval_runtime": 16.4822,
"eval_samples_per_second": 55.09,
"eval_steps_per_second": 55.09,
"step": 74000
},
{
"entropy": 1.1560133908391,
"epoch": 2.193045736545953,
"grad_norm": 7.625,
"learning_rate": 8.319417726037721e-06,
"loss": 0.06,
"mean_token_accuracy": 0.9822552761137485,
"num_tokens": 187228261.0,
"step": 76000
},
{
"epoch": 2.193045736545953,
"eval_entropy": 1.2220293277554575,
"eval_loss": 0.8315507769584656,
"eval_mean_token_accuracy": 0.9036543207809263,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 187228261.0,
"eval_recall": 0.5716,
"eval_runtime": 16.703,
"eval_samples_per_second": 54.361,
"eval_steps_per_second": 54.361,
"step": 76000
},
{
"entropy": 1.1676500248610973,
"epoch": 2.250757466455057,
"grad_norm": 4.84375,
"learning_rate": 7.72445114332745e-06,
"loss": 0.0611,
"mean_token_accuracy": 0.9819406977891922,
"num_tokens": 202699683.0,
"step": 78000
},
{
"epoch": 2.250757466455057,
"eval_entropy": 1.2448954319638827,
"eval_loss": 0.8309385776519775,
"eval_mean_token_accuracy": 0.9030922418255113,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 202699683.0,
"eval_recall": 0.5694,
"eval_runtime": 16.603,
"eval_samples_per_second": 54.689,
"eval_steps_per_second": 54.689,
"step": 78000
},
{
"entropy": 1.1656713368594647,
"epoch": 2.308469196364161,
"grad_norm": 6.53125,
"learning_rate": 7.129484560617179e-06,
"loss": 0.0618,
"mean_token_accuracy": 0.9817487963140011,
"num_tokens": 218284466.0,
"step": 80000
},
{
"epoch": 2.308469196364161,
"eval_entropy": 1.255102663181952,
"eval_loss": 0.8435425162315369,
"eval_mean_token_accuracy": 0.902260869642974,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 218284466.0,
"eval_recall": 0.5661,
"eval_runtime": 16.8256,
"eval_samples_per_second": 53.965,
"eval_steps_per_second": 53.965,
"step": 80000
},
{
"entropy": 1.1597592905461789,
"epoch": 2.366180926273265,
"grad_norm": 2.5,
"learning_rate": 6.534517977906908e-06,
"loss": 0.0602,
"mean_token_accuracy": 0.9821576415896416,
"num_tokens": 233928452.0,
"step": 82000
},
{
"epoch": 2.366180926273265,
"eval_entropy": 1.2422783964924875,
"eval_loss": 0.8390738368034363,
"eval_mean_token_accuracy": 0.9032785006950605,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 233928452.0,
"eval_recall": 0.5683,
"eval_runtime": 16.7577,
"eval_samples_per_second": 54.184,
"eval_steps_per_second": 54.184,
"step": 82000
},
{
"entropy": 1.17008468157053,
"epoch": 2.4238926561823693,
"grad_norm": 0.0400390625,
"learning_rate": 5.939551395196637e-06,
"loss": 0.0591,
"mean_token_accuracy": 0.9825585896968841,
"num_tokens": 249419664.0,
"step": 84000
},
{
"epoch": 2.4238926561823693,
"eval_entropy": 1.2469606770424067,
"eval_loss": 0.8383654356002808,
"eval_mean_token_accuracy": 0.9040639832418921,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 249419664.0,
"eval_recall": 0.5705,
"eval_runtime": 16.6392,
"eval_samples_per_second": 54.57,
"eval_steps_per_second": 54.57,
"step": 84000
},
{
"entropy": 1.163529093414545,
"epoch": 2.481604386091473,
"grad_norm": 6.28125,
"learning_rate": 5.3445848124863655e-06,
"loss": 0.0568,
"mean_token_accuracy": 0.9832313210368157,
"num_tokens": 264982654.0,
"step": 86000
},
{
"epoch": 2.481604386091473,
"eval_entropy": 1.236849331908289,
"eval_loss": 0.8381890058517456,
"eval_mean_token_accuracy": 0.9027883698630438,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 264982654.0,
"eval_recall": 0.5672,
"eval_runtime": 16.5964,
"eval_samples_per_second": 54.711,
"eval_steps_per_second": 54.711,
"step": 86000
},
{
"entropy": 1.1701532056927682,
"epoch": 2.539316116000577,
"grad_norm": 0.87109375,
"learning_rate": 4.749618229776094e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.9830155865848065,
"num_tokens": 280520807.0,
"step": 88000
},
{
"epoch": 2.539316116000577,
"eval_entropy": 1.2492524392673097,
"eval_loss": 0.839518666267395,
"eval_mean_token_accuracy": 0.9025986767681685,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 280520807.0,
"eval_recall": 0.5661,
"eval_runtime": 16.5519,
"eval_samples_per_second": 54.858,
"eval_steps_per_second": 54.858,
"step": 88000
},
{
"entropy": 1.167941878914833,
"epoch": 2.597027845909681,
"grad_norm": 0.451171875,
"learning_rate": 4.154651647065824e-06,
"loss": 0.0602,
"mean_token_accuracy": 0.9820802296400071,
"num_tokens": 296146535.0,
"step": 90000
},
{
"epoch": 2.597027845909681,
"eval_entropy": 1.2443812186234848,
"eval_loss": 0.8395401835441589,
"eval_mean_token_accuracy": 0.9034286766981764,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 296146535.0,
"eval_recall": 0.5683,
"eval_runtime": 16.788,
"eval_samples_per_second": 54.086,
"eval_steps_per_second": 54.086,
"step": 90000
},
{
"entropy": 1.1601335457861424,
"epoch": 2.6547395758187853,
"grad_norm": 4.6875,
"learning_rate": 3.559685064355552e-06,
"loss": 0.0584,
"mean_token_accuracy": 0.9827592859268188,
"num_tokens": 311778551.0,
"step": 92000
},
{
"epoch": 2.6547395758187853,
"eval_entropy": 1.2437387075324415,
"eval_loss": 0.836577296257019,
"eval_mean_token_accuracy": 0.9039776291419231,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 311778551.0,
"eval_recall": 0.5705,
"eval_runtime": 16.2272,
"eval_samples_per_second": 55.956,
"eval_steps_per_second": 55.956,
"step": 92000
},
{
"entropy": 1.1733056641221047,
"epoch": 2.712451305727889,
"grad_norm": 5.03125,
"learning_rate": 2.964718481645281e-06,
"loss": 0.0564,
"mean_token_accuracy": 0.9832823853492737,
"num_tokens": 327170479.0,
"step": 94000
},
{
"epoch": 2.712451305727889,
"eval_entropy": 1.2440849004337966,
"eval_loss": 0.8399211168289185,
"eval_mean_token_accuracy": 0.9033104040155326,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 327170479.0,
"eval_recall": 0.5683,
"eval_runtime": 16.2233,
"eval_samples_per_second": 55.969,
"eval_steps_per_second": 55.969,
"step": 94000
},
{
"entropy": 1.1586334483027458,
"epoch": 2.770163035636993,
"grad_norm": 3.953125,
"learning_rate": 2.36975189893501e-06,
"loss": 0.0585,
"mean_token_accuracy": 0.9826480825543403,
"num_tokens": 342791353.0,
"step": 96000
},
{
"epoch": 2.770163035636993,
"eval_entropy": 1.2412338042180444,
"eval_loss": 0.8378188610076904,
"eval_mean_token_accuracy": 0.9035194405572005,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 342791353.0,
"eval_recall": 0.5672,
"eval_runtime": 16.2077,
"eval_samples_per_second": 56.023,
"eval_steps_per_second": 56.023,
"step": 96000
},
{
"entropy": 1.1629991734027862,
"epoch": 2.827874765546097,
"grad_norm": 6.4375,
"learning_rate": 1.7747853162247388e-06,
"loss": 0.0608,
"mean_token_accuracy": 0.9821404512822628,
"num_tokens": 358436354.0,
"step": 98000
},
{
"epoch": 2.827874765546097,
"eval_entropy": 1.2435034370369848,
"eval_loss": 0.8380420207977295,
"eval_mean_token_accuracy": 0.9037704004327631,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 358436354.0,
"eval_recall": 0.5694,
"eval_runtime": 16.2198,
"eval_samples_per_second": 55.981,
"eval_steps_per_second": 55.981,
"step": 98000
},
{
"entropy": 1.1640874392092229,
"epoch": 2.8855864954552013,
"grad_norm": 0.8203125,
"learning_rate": 1.1798187335144677e-06,
"loss": 0.0574,
"mean_token_accuracy": 0.9829988768994808,
"num_tokens": 374029027.0,
"step": 100000
},
{
"epoch": 2.8855864954552013,
"eval_entropy": 1.2438825091207606,
"eval_loss": 0.8370459079742432,
"eval_mean_token_accuracy": 0.9030486140810445,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 374029027.0,
"eval_recall": 0.5661,
"eval_runtime": 17.3712,
"eval_samples_per_second": 52.271,
"eval_steps_per_second": 52.271,
"step": 100000
},
{
"entropy": 1.1683570961356162,
"epoch": 2.9432982253643054,
"grad_norm": 6.65625,
"learning_rate": 5.848521508041964e-07,
"loss": 0.0583,
"mean_token_accuracy": 0.982835016399622,
"num_tokens": 389554889.0,
"step": 102000
},
{
"epoch": 2.9432982253643054,
"eval_entropy": 1.2444104566710636,
"eval_loss": 0.8376456499099731,
"eval_mean_token_accuracy": 0.9037148623608282,
"eval_num_gold": 908,
"eval_num_guess": 908,
"eval_num_tokens": 389554889.0,
"eval_recall": 0.5683,
"eval_runtime": 16.2408,
"eval_samples_per_second": 55.909,
"eval_steps_per_second": 55.909,
"step": 102000
}
],
"logging_steps": 2000,
"max_steps": 103965,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.6466469785747587e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}