{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.571428571428571, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07142857142857142, "grad_norm": 0.11384429782629013, "learning_rate": 9.940119760479042e-05, "loss": 1.1555, "step": 10 }, { "epoch": 0.14285714285714285, "grad_norm": 0.18670473992824554, "learning_rate": 9.820359281437126e-05, "loss": 0.986, "step": 20 }, { "epoch": 0.21428571428571427, "grad_norm": 0.1131674274802208, "learning_rate": 9.700598802395209e-05, "loss": 0.6862, "step": 30 }, { "epoch": 0.2857142857142857, "grad_norm": 0.09851589798927307, "learning_rate": 9.580838323353294e-05, "loss": 0.6073, "step": 40 }, { "epoch": 0.35714285714285715, "grad_norm": 0.07161638140678406, "learning_rate": 9.461077844311378e-05, "loss": 0.5746, "step": 50 }, { "epoch": 0.42857142857142855, "grad_norm": 0.060462553054094315, "learning_rate": 9.341317365269462e-05, "loss": 0.568, "step": 60 }, { "epoch": 0.5, "grad_norm": 0.06703554838895798, "learning_rate": 9.221556886227547e-05, "loss": 0.571, "step": 70 }, { "epoch": 0.5714285714285714, "grad_norm": 0.06138037517666817, "learning_rate": 9.101796407185628e-05, "loss": 0.5884, "step": 80 }, { "epoch": 0.6428571428571429, "grad_norm": 0.06791023164987564, "learning_rate": 8.982035928143712e-05, "loss": 0.5681, "step": 90 }, { "epoch": 0.7142857142857143, "grad_norm": 0.06872102618217468, "learning_rate": 8.862275449101797e-05, "loss": 0.5602, "step": 100 }, { "epoch": 0.7857142857142857, "grad_norm": 0.07038596272468567, "learning_rate": 8.742514970059881e-05, "loss": 0.5587, "step": 110 }, { "epoch": 0.8571428571428571, "grad_norm": 0.07541386038064957, "learning_rate": 8.622754491017964e-05, "loss": 0.5442, "step": 120 }, { "epoch": 0.9285714285714286, "grad_norm": 0.09144806116819382, "learning_rate": 8.502994011976048e-05, "loss": 0.5272, "step": 130 }, { "epoch": 1.0, "grad_norm": 0.10239645093679428, "learning_rate": 8.383233532934131e-05, "loss": 0.5709, "step": 140 }, { "epoch": 1.0714285714285714, "grad_norm": 0.0748690515756607, "learning_rate": 8.263473053892216e-05, "loss": 0.5114, "step": 150 }, { "epoch": 1.1428571428571428, "grad_norm": 0.08167172223329544, "learning_rate": 8.1437125748503e-05, "loss": 0.5227, "step": 160 }, { "epoch": 1.2142857142857142, "grad_norm": 0.1070200651884079, "learning_rate": 8.023952095808383e-05, "loss": 0.5444, "step": 170 }, { "epoch": 1.2857142857142856, "grad_norm": 0.09023091942071915, "learning_rate": 7.904191616766467e-05, "loss": 0.5313, "step": 180 }, { "epoch": 1.3571428571428572, "grad_norm": 0.11614037305116653, "learning_rate": 7.784431137724552e-05, "loss": 0.5109, "step": 190 }, { "epoch": 1.4285714285714286, "grad_norm": 0.09616102278232574, "learning_rate": 7.664670658682636e-05, "loss": 0.5205, "step": 200 }, { "epoch": 1.5, "grad_norm": 0.1161426305770874, "learning_rate": 7.544910179640719e-05, "loss": 0.4951, "step": 210 }, { "epoch": 1.5714285714285714, "grad_norm": 0.11568252742290497, "learning_rate": 7.425149700598802e-05, "loss": 0.5076, "step": 220 }, { "epoch": 1.6428571428571428, "grad_norm": 0.09209095686674118, "learning_rate": 7.305389221556886e-05, "loss": 0.4703, "step": 230 }, { "epoch": 1.7142857142857144, "grad_norm": 0.11663572490215302, "learning_rate": 7.18562874251497e-05, "loss": 0.5073, "step": 240 }, { "epoch": 1.7857142857142856, "grad_norm": 0.11245130747556686, "learning_rate": 7.065868263473055e-05, "loss": 0.4897, "step": 250 }, { "epoch": 1.8571428571428572, "grad_norm": 0.12360594421625137, "learning_rate": 6.946107784431138e-05, "loss": 0.5232, "step": 260 }, { "epoch": 1.9285714285714286, "grad_norm": 0.11183296144008636, "learning_rate": 6.826347305389222e-05, "loss": 0.5185, "step": 270 }, { "epoch": 2.0, "grad_norm": 0.13983388245105743, "learning_rate": 6.706586826347305e-05, "loss": 0.4794, "step": 280 }, { "epoch": 2.0714285714285716, "grad_norm": 0.11507315188646317, "learning_rate": 6.58682634730539e-05, "loss": 0.4637, "step": 290 }, { "epoch": 2.142857142857143, "grad_norm": 0.13093125820159912, "learning_rate": 6.467065868263474e-05, "loss": 0.4681, "step": 300 }, { "epoch": 2.2142857142857144, "grad_norm": 0.13779377937316895, "learning_rate": 6.347305389221557e-05, "loss": 0.4643, "step": 310 }, { "epoch": 2.2857142857142856, "grad_norm": 0.14057691395282745, "learning_rate": 6.227544910179641e-05, "loss": 0.5048, "step": 320 }, { "epoch": 2.357142857142857, "grad_norm": 0.12082990258932114, "learning_rate": 6.107784431137725e-05, "loss": 0.4655, "step": 330 }, { "epoch": 2.4285714285714284, "grad_norm": 0.16593365371227264, "learning_rate": 5.988023952095808e-05, "loss": 0.4419, "step": 340 }, { "epoch": 2.5, "grad_norm": 0.13827389478683472, "learning_rate": 5.868263473053892e-05, "loss": 0.4505, "step": 350 }, { "epoch": 2.571428571428571, "grad_norm": 0.13826914131641388, "learning_rate": 5.748502994011976e-05, "loss": 0.4786, "step": 360 }, { "epoch": 2.642857142857143, "grad_norm": 0.13597093522548676, "learning_rate": 5.62874251497006e-05, "loss": 0.4541, "step": 370 }, { "epoch": 2.7142857142857144, "grad_norm": 0.16901850700378418, "learning_rate": 5.508982035928144e-05, "loss": 0.4613, "step": 380 }, { "epoch": 2.7857142857142856, "grad_norm": 0.15610496699810028, "learning_rate": 5.389221556886228e-05, "loss": 0.4725, "step": 390 }, { "epoch": 2.857142857142857, "grad_norm": 0.14943359792232513, "learning_rate": 5.269461077844312e-05, "loss": 0.4736, "step": 400 }, { "epoch": 2.928571428571429, "grad_norm": 0.15164442360401154, "learning_rate": 5.149700598802395e-05, "loss": 0.4526, "step": 410 }, { "epoch": 3.0, "grad_norm": 0.19106097519397736, "learning_rate": 5.029940119760479e-05, "loss": 0.5182, "step": 420 }, { "epoch": 3.0714285714285716, "grad_norm": 0.1616070717573166, "learning_rate": 4.910179640718563e-05, "loss": 0.4176, "step": 430 }, { "epoch": 3.142857142857143, "grad_norm": 0.1612323522567749, "learning_rate": 4.790419161676647e-05, "loss": 0.4299, "step": 440 }, { "epoch": 3.2142857142857144, "grad_norm": 0.16307303309440613, "learning_rate": 4.670658682634731e-05, "loss": 0.4555, "step": 450 }, { "epoch": 3.2857142857142856, "grad_norm": 0.18952146172523499, "learning_rate": 4.550898203592814e-05, "loss": 0.4005, "step": 460 }, { "epoch": 3.357142857142857, "grad_norm": 0.1773829460144043, "learning_rate": 4.4311377245508984e-05, "loss": 0.4258, "step": 470 }, { "epoch": 3.4285714285714284, "grad_norm": 0.1871429979801178, "learning_rate": 4.311377245508982e-05, "loss": 0.4363, "step": 480 }, { "epoch": 3.5, "grad_norm": 0.19548629224300385, "learning_rate": 4.191616766467066e-05, "loss": 0.4218, "step": 490 }, { "epoch": 3.571428571428571, "grad_norm": 0.17752481997013092, "learning_rate": 4.07185628742515e-05, "loss": 0.4383, "step": 500 } ], "logging_steps": 10, "max_steps": 840, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.06922293442347e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }