Omni-7B-sft-mix-0920 / trainer_state.json
hrw's picture
Upload folder using huggingface_hub
4822576 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100.0,
"global_step": 210,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0096,
"grad_norm": 0.9936648607254028,
"learning_rate": 7.142857142857143e-07,
"loss": 1.3031667470932007,
"memory(GiB)": 27.78,
"step": 1,
"token_acc": 0.6664092664092665,
"train_speed(iter/s)": 0.077355
},
{
"epoch": 0.048,
"grad_norm": 0.9462715983390808,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.2822166681289673,
"memory(GiB)": 37.31,
"step": 5,
"token_acc": 0.6730212596067472,
"train_speed(iter/s)": 0.117204
},
{
"epoch": 0.096,
"grad_norm": 0.9813357591629028,
"learning_rate": 4.997306095597203e-06,
"loss": 1.2855051040649415,
"memory(GiB)": 37.33,
"step": 10,
"token_acc": 0.6950031286314472,
"train_speed(iter/s)": 0.12742
},
{
"epoch": 0.144,
"grad_norm": 1.0169689655303955,
"learning_rate": 4.980864366515159e-06,
"loss": 1.3038410186767577,
"memory(GiB)": 37.33,
"step": 15,
"token_acc": 0.6799628158539677,
"train_speed(iter/s)": 0.131128
},
{
"epoch": 0.192,
"grad_norm": 1.027754545211792,
"learning_rate": 4.949575798648962e-06,
"loss": 1.2775081634521483,
"memory(GiB)": 37.33,
"step": 20,
"token_acc": 0.6861788617886179,
"train_speed(iter/s)": 0.133544
},
{
"epoch": 0.24,
"grad_norm": 0.9780960083007812,
"learning_rate": 4.903627639769656e-06,
"loss": 1.269430446624756,
"memory(GiB)": 40.76,
"step": 25,
"token_acc": 0.6818361069484975,
"train_speed(iter/s)": 0.134589
},
{
"epoch": 0.288,
"grad_norm": 0.9856972694396973,
"learning_rate": 4.8432948685969646e-06,
"loss": 1.2507415771484376,
"memory(GiB)": 40.77,
"step": 30,
"token_acc": 0.6933913731697665,
"train_speed(iter/s)": 0.134694
},
{
"epoch": 0.336,
"grad_norm": 0.9181770086288452,
"learning_rate": 4.7689385491773934e-06,
"loss": 1.2203140258789062,
"memory(GiB)": 40.77,
"step": 35,
"token_acc": 0.6926996324757768,
"train_speed(iter/s)": 0.135293
},
{
"epoch": 0.384,
"grad_norm": 0.8927274346351624,
"learning_rate": 4.681003670081015e-06,
"loss": 1.165791893005371,
"memory(GiB)": 40.77,
"step": 40,
"token_acc": 0.6992338742894801,
"train_speed(iter/s)": 0.13613
},
{
"epoch": 0.432,
"grad_norm": 0.8748621344566345,
"learning_rate": 4.580016481348367e-06,
"loss": 1.156367015838623,
"memory(GiB)": 44.42,
"step": 45,
"token_acc": 0.6986509140257045,
"train_speed(iter/s)": 0.136251
},
{
"epoch": 0.48,
"grad_norm": 0.8102411031723022,
"learning_rate": 4.466581345124605e-06,
"loss": 1.1229158401489259,
"memory(GiB)": 44.42,
"step": 50,
"token_acc": 0.7167247090800986,
"train_speed(iter/s)": 0.136659
},
{
"epoch": 0.528,
"grad_norm": 0.8838515281677246,
"learning_rate": 4.341377118828415e-06,
"loss": 1.107612419128418,
"memory(GiB)": 48.14,
"step": 55,
"token_acc": 0.7120900797238432,
"train_speed(iter/s)": 0.136689
},
{
"epoch": 0.576,
"grad_norm": 0.8315885663032532,
"learning_rate": 4.205153092500805e-06,
"loss": 1.0878031730651856,
"memory(GiB)": 48.14,
"step": 60,
"token_acc": 0.715117320437041,
"train_speed(iter/s)": 0.13697
},
{
"epoch": 0.624,
"grad_norm": 0.8217139840126038,
"learning_rate": 4.058724504646834e-06,
"loss": 1.0752467155456542,
"memory(GiB)": 48.14,
"step": 65,
"token_acc": 0.7134517984376342,
"train_speed(iter/s)": 0.137088
},
{
"epoch": 0.672,
"grad_norm": 0.8661554455757141,
"learning_rate": 3.9029676634059565e-06,
"loss": 1.0480419158935548,
"memory(GiB)": 48.14,
"step": 70,
"token_acc": 0.720516548362268,
"train_speed(iter/s)": 0.137508
},
{
"epoch": 0.72,
"grad_norm": 0.7952479124069214,
"learning_rate": 3.738814702248524e-06,
"loss": 1.0351552963256836,
"memory(GiB)": 48.14,
"step": 75,
"token_acc": 0.7191632811824704,
"train_speed(iter/s)": 0.137541
},
{
"epoch": 0.768,
"grad_norm": 0.7449638247489929,
"learning_rate": 3.5672480015832117e-06,
"loss": 1.005133056640625,
"memory(GiB)": 52.23,
"step": 80,
"token_acc": 0.7209397177583816,
"train_speed(iter/s)": 0.137691
},
{
"epoch": 0.816,
"grad_norm": 0.7283169627189636,
"learning_rate": 3.3892943096594968e-06,
"loss": 0.9723712921142578,
"memory(GiB)": 52.23,
"step": 85,
"token_acc": 0.7297183690626313,
"train_speed(iter/s)": 0.13788
},
{
"epoch": 0.864,
"grad_norm": 0.7156486511230469,
"learning_rate": 3.206018597948893e-06,
"loss": 0.9777496337890625,
"memory(GiB)": 52.23,
"step": 90,
"token_acc": 0.7360159362549801,
"train_speed(iter/s)": 0.138013
},
{
"epoch": 0.912,
"grad_norm": 0.6714794635772705,
"learning_rate": 3.018517687777688e-06,
"loss": 0.9577500343322753,
"memory(GiB)": 52.23,
"step": 95,
"token_acc": 0.7401526717557252,
"train_speed(iter/s)": 0.138017
},
{
"epoch": 0.96,
"grad_norm": 0.6295478940010071,
"learning_rate": 2.827913686352856e-06,
"loss": 0.9314702987670899,
"memory(GiB)": 52.23,
"step": 100,
"token_acc": 0.7466477222271557,
"train_speed(iter/s)": 0.138222
},
{
"epoch": 1.0,
"grad_norm": 0.6380690336227417,
"learning_rate": 2.6353472714635443e-06,
"loss": 0.9214505195617676,
"memory(GiB)": 52.23,
"step": 105,
"token_acc": 0.7306868047759976,
"train_speed(iter/s)": 0.135794
},
{
"epoch": 1.048,
"grad_norm": 0.6720516085624695,
"learning_rate": 2.441970865046111e-06,
"loss": 0.9319070816040039,
"memory(GiB)": 52.23,
"step": 110,
"token_acc": 0.737565968335199,
"train_speed(iter/s)": 0.13602
},
{
"epoch": 1.096,
"grad_norm": 0.7207827568054199,
"learning_rate": 2.2489417364658194e-06,
"loss": 0.9134335517883301,
"memory(GiB)": 52.23,
"step": 115,
"token_acc": 0.7528385866109547,
"train_speed(iter/s)": 0.136276
},
{
"epoch": 1.144,
"grad_norm": 0.6305469870567322,
"learning_rate": 2.0574150767888795e-06,
"loss": 0.9125921249389648,
"memory(GiB)": 52.23,
"step": 120,
"token_acc": 0.7383805374001452,
"train_speed(iter/s)": 0.136526
},
{
"epoch": 1.192,
"grad_norm": 0.6620354056358337,
"learning_rate": 1.8685370854921631e-06,
"loss": 0.9069900512695312,
"memory(GiB)": 52.23,
"step": 125,
"token_acc": 0.7528492866820754,
"train_speed(iter/s)": 0.136582
},
{
"epoch": 1.24,
"grad_norm": 0.6394309401512146,
"learning_rate": 1.6834381109834696e-06,
"loss": 0.895142650604248,
"memory(GiB)": 52.23,
"step": 130,
"token_acc": 0.7488820899035067,
"train_speed(iter/s)": 0.136691
},
{
"epoch": 1.288,
"grad_norm": 0.5836830735206604,
"learning_rate": 1.5032258859831916e-06,
"loss": 0.8956113815307617,
"memory(GiB)": 52.23,
"step": 135,
"token_acc": 0.7457598371777476,
"train_speed(iter/s)": 0.13675
},
{
"epoch": 1.336,
"grad_norm": 0.5948861837387085,
"learning_rate": 1.328978898250525e-06,
"loss": 0.8733110427856445,
"memory(GiB)": 52.23,
"step": 140,
"token_acc": 0.7442634138019278,
"train_speed(iter/s)": 0.136942
},
{
"epoch": 1.384,
"grad_norm": 0.6282215118408203,
"learning_rate": 1.1617399363274024e-06,
"loss": 0.8916261672973633,
"memory(GiB)": 52.23,
"step": 145,
"token_acc": 0.7513582342954159,
"train_speed(iter/s)": 0.136986
},
{
"epoch": 1.432,
"grad_norm": 0.5960206389427185,
"learning_rate": 1.0025098489259161e-06,
"loss": 0.8885273933410645,
"memory(GiB)": 52.23,
"step": 150,
"token_acc": 0.7522109400589584,
"train_speed(iter/s)": 0.137046
},
{
"epoch": 1.48,
"grad_norm": 0.5816349983215332,
"learning_rate": 8.522415553064433e-07,
"loss": 0.8631435394287109,
"memory(GiB)": 52.23,
"step": 155,
"token_acc": 0.7504964171630838,
"train_speed(iter/s)": 0.137186
},
{
"epoch": 1.528,
"grad_norm": 0.5882980823516846,
"learning_rate": 7.118343424916249e-07,
"loss": 0.8616400718688965,
"memory(GiB)": 52.23,
"step": 160,
"token_acc": 0.7499206349206349,
"train_speed(iter/s)": 0.137302
},
{
"epoch": 1.576,
"grad_norm": 0.6468539237976074,
"learning_rate": 5.821284834447586e-07,
"loss": 0.8487722396850585,
"memory(GiB)": 52.23,
"step": 165,
"token_acc": 0.7601775198429632,
"train_speed(iter/s)": 0.137471
},
{
"epoch": 1.624,
"grad_norm": 0.5706282258033752,
"learning_rate": 4.6390020842035755e-07,
"loss": 0.8451438903808594,
"memory(GiB)": 52.23,
"step": 170,
"token_acc": 0.7554143126177024,
"train_speed(iter/s)": 0.137542
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.6457207798957825,
"learning_rate": 3.578570595810274e-07,
"loss": 0.8587837219238281,
"memory(GiB)": 52.23,
"step": 175,
"token_acc": 0.7626084715181717,
"train_speed(iter/s)": 0.137627
},
{
"epoch": 1.72,
"grad_norm": 0.5656464695930481,
"learning_rate": 2.646336566811686e-07,
"loss": 0.8507623672485352,
"memory(GiB)": 52.23,
"step": 180,
"token_acc": 0.757103655837833,
"train_speed(iter/s)": 0.137634
},
{
"epoch": 1.768,
"grad_norm": 0.5902859568595886,
"learning_rate": 1.847878991579477e-07,
"loss": 0.8512592315673828,
"memory(GiB)": 56.56,
"step": 185,
"token_acc": 0.7558876383240329,
"train_speed(iter/s)": 0.13778
},
{
"epoch": 1.8159999999999998,
"grad_norm": 0.5729983448982239,
"learning_rate": 1.1879762735828081e-07,
"loss": 0.8604719161987304,
"memory(GiB)": 56.56,
"step": 190,
"token_acc": 0.7497445571013126,
"train_speed(iter/s)": 0.137818
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.5367587208747864,
"learning_rate": 6.705776288286281e-08,
"loss": 0.8579318046569824,
"memory(GiB)": 56.56,
"step": 195,
"token_acc": 0.7590520658972778,
"train_speed(iter/s)": 0.137879
},
{
"epoch": 1.912,
"grad_norm": 0.5742630362510681,
"learning_rate": 2.987794516097875e-08,
"loss": 0.866505241394043,
"memory(GiB)": 56.56,
"step": 200,
"token_acc": 0.7586411889596603,
"train_speed(iter/s)": 0.137909
},
{
"epoch": 1.96,
"grad_norm": 0.5835118293762207,
"learning_rate": 7.480678400109965e-09,
"loss": 0.8637813568115235,
"memory(GiB)": 56.56,
"step": 205,
"token_acc": 0.7485916626428456,
"train_speed(iter/s)": 0.136252
},
{
"epoch": 2.0,
"grad_norm": 0.6009649634361267,
"learning_rate": 0.0,
"loss": 0.8247391700744628,
"memory(GiB)": 61.37,
"step": 210,
"token_acc": 0.770509227944729,
"train_speed(iter/s)": 0.136863
}
],
"logging_steps": 5,
"max_steps": 210,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.529829855770706e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}