{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99968, "eval_steps": 500, "global_step": 781, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0128, "grad_norm": 6.701968747476725, "learning_rate": 1.2658227848101267e-06, "loss": 2.0492, "step": 10 }, { "epoch": 0.0256, "grad_norm": 3.5864792739638993, "learning_rate": 2.5316455696202535e-06, "loss": 1.8655, "step": 20 }, { "epoch": 0.0384, "grad_norm": 2.259761356336166, "learning_rate": 3.7974683544303802e-06, "loss": 1.4331, "step": 30 }, { "epoch": 0.0512, "grad_norm": 1.2720768014744341, "learning_rate": 5.063291139240507e-06, "loss": 1.0761, "step": 40 }, { "epoch": 0.064, "grad_norm": 1.0682982350883623, "learning_rate": 6.329113924050634e-06, "loss": 0.8806, "step": 50 }, { "epoch": 0.0768, "grad_norm": 0.9449369476401172, "learning_rate": 7.5949367088607605e-06, "loss": 0.7862, "step": 60 }, { "epoch": 0.0896, "grad_norm": 0.838448848722395, "learning_rate": 8.860759493670886e-06, "loss": 0.7079, "step": 70 }, { "epoch": 0.1024, "grad_norm": 0.8665745666024749, "learning_rate": 9.99994993147413e-06, "loss": 0.6374, "step": 80 }, { "epoch": 0.1152, "grad_norm": 0.9204076286525019, "learning_rate": 9.993942921593858e-06, "loss": 0.6237, "step": 90 }, { "epoch": 0.128, "grad_norm": 0.8161132171073118, "learning_rate": 9.977935989714594e-06, "loss": 0.5959, "step": 100 }, { "epoch": 0.1408, "grad_norm": 0.8084929937439845, "learning_rate": 9.951961188279216e-06, "loss": 0.5679, "step": 110 }, { "epoch": 0.1536, "grad_norm": 0.8088758200190634, "learning_rate": 9.916070529493785e-06, "loss": 0.5446, "step": 120 }, { "epoch": 0.1664, "grad_norm": 0.9130344418330479, "learning_rate": 9.870335881177774e-06, "loss": 0.5291, "step": 130 }, { "epoch": 0.1792, "grad_norm": 0.872718919094986, "learning_rate": 9.814848822855216e-06, "loss": 0.5222, "step": 140 }, { "epoch": 0.192, "grad_norm": 0.8235461664054124, "learning_rate": 9.749720462374939e-06, "loss": 0.5171, "step": 150 }, { "epoch": 0.2048, "grad_norm": 0.7993561717725581, "learning_rate": 9.675081213427076e-06, "loss": 0.501, "step": 160 }, { "epoch": 0.2176, "grad_norm": 0.8630181557231864, "learning_rate": 9.591080534401371e-06, "loss": 0.4905, "step": 170 }, { "epoch": 0.2304, "grad_norm": 0.7780745671601388, "learning_rate": 9.497886629110187e-06, "loss": 0.4887, "step": 180 }, { "epoch": 0.2432, "grad_norm": 0.8530578246740471, "learning_rate": 9.395686109975475e-06, "loss": 0.4754, "step": 190 }, { "epoch": 0.256, "grad_norm": 0.8589511545486503, "learning_rate": 9.284683624354172e-06, "loss": 0.4681, "step": 200 }, { "epoch": 0.2688, "grad_norm": 0.8106070665074115, "learning_rate": 9.165101444750259e-06, "loss": 0.4503, "step": 210 }, { "epoch": 0.2816, "grad_norm": 0.828720382021103, "learning_rate": 9.037179023734036e-06, "loss": 0.4505, "step": 220 }, { "epoch": 0.2944, "grad_norm": 0.7971023327381064, "learning_rate": 8.901172514459864e-06, "loss": 0.4492, "step": 230 }, { "epoch": 0.3072, "grad_norm": 0.7508670805091862, "learning_rate": 8.757354257742501e-06, "loss": 0.4428, "step": 240 }, { "epoch": 0.32, "grad_norm": 0.8374186185876672, "learning_rate": 8.606012236719073e-06, "loss": 0.4357, "step": 250 }, { "epoch": 0.3328, "grad_norm": 0.8494045534506517, "learning_rate": 8.447449500188731e-06, "loss": 0.4335, "step": 260 }, { "epoch": 0.3456, "grad_norm": 0.8159143844721547, "learning_rate": 8.28198355578465e-06, "loss": 0.4249, "step": 270 }, { "epoch": 0.3584, "grad_norm": 0.7945294112933121, "learning_rate": 8.10994573419352e-06, "loss": 0.417, "step": 280 }, { "epoch": 0.3712, "grad_norm": 0.8092393955680712, "learning_rate": 7.931680525695634e-06, "loss": 0.4101, "step": 290 }, { "epoch": 0.384, "grad_norm": 0.8268532168275566, "learning_rate": 7.747544890354031e-06, "loss": 0.4102, "step": 300 }, { "epoch": 0.3968, "grad_norm": 0.8307675190521085, "learning_rate": 7.557907543234051e-06, "loss": 0.4005, "step": 310 }, { "epoch": 0.4096, "grad_norm": 0.8066842247859759, "learning_rate": 7.363148216084548e-06, "loss": 0.3938, "step": 320 }, { "epoch": 0.4224, "grad_norm": 0.8219121679671916, "learning_rate": 7.163656896959181e-06, "loss": 0.4025, "step": 330 }, { "epoch": 0.4352, "grad_norm": 0.7806012947481233, "learning_rate": 6.959833049300376e-06, "loss": 0.3917, "step": 340 }, { "epoch": 0.448, "grad_norm": 0.7583628073160137, "learning_rate": 6.75208481204967e-06, "loss": 0.3956, "step": 350 }, { "epoch": 0.4608, "grad_norm": 0.7932654942930134, "learning_rate": 6.540828182386154e-06, "loss": 0.3863, "step": 360 }, { "epoch": 0.4736, "grad_norm": 0.7950343391931867, "learning_rate": 6.326486182729504e-06, "loss": 0.386, "step": 370 }, { "epoch": 0.4864, "grad_norm": 0.7556004511705458, "learning_rate": 6.1094880136755886e-06, "loss": 0.3876, "step": 380 }, { "epoch": 0.4992, "grad_norm": 0.7878348206227572, "learning_rate": 5.890268194560834e-06, "loss": 0.3751, "step": 390 }, { "epoch": 0.512, "grad_norm": 0.9576214818021522, "learning_rate": 5.669265693376309e-06, "loss": 0.3773, "step": 400 }, { "epoch": 0.5248, "grad_norm": 0.7598363460040048, "learning_rate": 5.4469230477737466e-06, "loss": 0.3785, "step": 410 }, { "epoch": 0.5376, "grad_norm": 0.7420324190383087, "learning_rate": 5.223685478923671e-06, "loss": 0.3641, "step": 420 }, { "epoch": 0.5504, "grad_norm": 0.8017146285762878, "learning_rate": 5e-06, "loss": 0.3769, "step": 430 }, { "epoch": 0.5632, "grad_norm": 0.7670735510600653, "learning_rate": 4.77631452107633e-06, "loss": 0.3747, "step": 440 }, { "epoch": 0.576, "grad_norm": 0.7840762329546958, "learning_rate": 4.553076952226255e-06, "loss": 0.3539, "step": 450 }, { "epoch": 0.5888, "grad_norm": 0.8119170658398085, "learning_rate": 4.330734306623694e-06, "loss": 0.3702, "step": 460 }, { "epoch": 0.6016, "grad_norm": 0.7778368273813389, "learning_rate": 4.109731805439168e-06, "loss": 0.36, "step": 470 }, { "epoch": 0.6144, "grad_norm": 0.7979085157539837, "learning_rate": 3.890511986324413e-06, "loss": 0.3655, "step": 480 }, { "epoch": 0.6272, "grad_norm": 0.8322448925414319, "learning_rate": 3.6735138172704967e-06, "loss": 0.355, "step": 490 }, { "epoch": 0.64, "grad_norm": 0.8096327876710914, "learning_rate": 3.459171817613847e-06, "loss": 0.3593, "step": 500 }, { "epoch": 0.6528, "grad_norm": 0.801578229202024, "learning_rate": 3.2479151879503324e-06, "loss": 0.3547, "step": 510 }, { "epoch": 0.6656, "grad_norm": 0.7715945441438417, "learning_rate": 3.040166950699626e-06, "loss": 0.3564, "step": 520 }, { "epoch": 0.6784, "grad_norm": 0.7724629808163839, "learning_rate": 2.836343103040819e-06, "loss": 0.3455, "step": 530 }, { "epoch": 0.6912, "grad_norm": 0.7703132105218164, "learning_rate": 2.636851783915454e-06, "loss": 0.3574, "step": 540 }, { "epoch": 0.704, "grad_norm": 0.8346457043519747, "learning_rate": 2.4420924567659508e-06, "loss": 0.3563, "step": 550 }, { "epoch": 0.7168, "grad_norm": 0.7478114203409938, "learning_rate": 2.2524551096459703e-06, "loss": 0.3442, "step": 560 }, { "epoch": 0.7296, "grad_norm": 0.7175822746663116, "learning_rate": 2.068319474304365e-06, "loss": 0.3513, "step": 570 }, { "epoch": 0.7424, "grad_norm": 0.7765522703854497, "learning_rate": 1.8900542658064807e-06, "loss": 0.3505, "step": 580 }, { "epoch": 0.7552, "grad_norm": 0.8131308220238767, "learning_rate": 1.7180164442153529e-06, "loss": 0.3436, "step": 590 }, { "epoch": 0.768, "grad_norm": 0.7675648858585707, "learning_rate": 1.5525504998112717e-06, "loss": 0.3492, "step": 600 }, { "epoch": 0.7808, "grad_norm": 0.7848374060613897, "learning_rate": 1.3939877632809279e-06, "loss": 0.3376, "step": 610 }, { "epoch": 0.7936, "grad_norm": 0.7172688497141017, "learning_rate": 1.2426457422575e-06, "loss": 0.3509, "step": 620 }, { "epoch": 0.8064, "grad_norm": 0.7513406260924876, "learning_rate": 1.0988274855401377e-06, "loss": 0.3427, "step": 630 }, { "epoch": 0.8192, "grad_norm": 0.719098637869978, "learning_rate": 9.628209762659658e-07, "loss": 0.352, "step": 640 }, { "epoch": 0.832, "grad_norm": 0.7878403671063132, "learning_rate": 8.348985552497424e-07, "loss": 0.3512, "step": 650 }, { "epoch": 0.8448, "grad_norm": 0.778349621811645, "learning_rate": 7.153163756458287e-07, "loss": 0.3418, "step": 660 }, { "epoch": 0.8576, "grad_norm": 0.7627951845894294, "learning_rate": 6.043138900245277e-07, "loss": 0.3415, "step": 670 }, { "epoch": 0.8704, "grad_norm": 0.7267205608588538, "learning_rate": 5.021133708898146e-07, "loss": 0.3498, "step": 680 }, { "epoch": 0.8832, "grad_norm": 0.774296421867312, "learning_rate": 4.089194655986306e-07, "loss": 0.3418, "step": 690 }, { "epoch": 0.896, "grad_norm": 0.7629885918292927, "learning_rate": 3.2491878657292643e-07, "loss": 0.3417, "step": 700 }, { "epoch": 0.9088, "grad_norm": 0.7684825905537629, "learning_rate": 2.502795376250622e-07, "loss": 0.3414, "step": 710 }, { "epoch": 0.9216, "grad_norm": 0.7495748143605184, "learning_rate": 1.8515117714478447e-07, "loss": 0.3303, "step": 720 }, { "epoch": 0.9344, "grad_norm": 0.7729420979175768, "learning_rate": 1.2966411882222695e-07, "loss": 0.3516, "step": 730 }, { "epoch": 0.9472, "grad_norm": 0.7645078415028198, "learning_rate": 8.392947050621603e-08, "loss": 0.3459, "step": 740 }, { "epoch": 0.96, "grad_norm": 0.7401931558129734, "learning_rate": 4.803881172078473e-08, "loss": 0.3381, "step": 750 }, { "epoch": 0.9728, "grad_norm": 0.7847554015277991, "learning_rate": 2.206401028540639e-08, "loss": 0.3435, "step": 760 }, { "epoch": 0.9856, "grad_norm": 0.8032295395801471, "learning_rate": 6.057078406142003e-09, "loss": 0.3533, "step": 770 }, { "epoch": 0.9984, "grad_norm": 0.7538439870936887, "learning_rate": 5.0068525870305974e-11, "loss": 0.3355, "step": 780 }, { "epoch": 0.99968, "step": 781, "total_flos": 51822668021760.0, "train_loss": 0.4785180231902114, "train_runtime": 3564.8554, "train_samples_per_second": 28.052, "train_steps_per_second": 0.219 } ], "logging_steps": 10, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 51822668021760.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }