{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 100.0,
  "global_step": 210,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0096,
      "grad_norm": 0.9936648607254028,
      "learning_rate": 7.142857142857143e-07,
      "loss": 1.3031667470932007,
      "memory(GiB)": 27.78,
      "step": 1,
      "token_acc": 0.6664092664092665,
      "train_speed(iter/s)": 0.077355
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.9462715983390808,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.2822166681289673,
      "memory(GiB)": 37.31,
      "step": 5,
      "token_acc": 0.6730212596067472,
      "train_speed(iter/s)": 0.117204
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.9813357591629028,
      "learning_rate": 4.997306095597203e-06,
      "loss": 1.2855051040649415,
      "memory(GiB)": 37.33,
      "step": 10,
      "token_acc": 0.6950031286314472,
      "train_speed(iter/s)": 0.12742
    },
    {
      "epoch": 0.144,
      "grad_norm": 1.0169689655303955,
      "learning_rate": 4.980864366515159e-06,
      "loss": 1.3038410186767577,
      "memory(GiB)": 37.33,
      "step": 15,
      "token_acc": 0.6799628158539677,
      "train_speed(iter/s)": 0.131128
    },
    {
      "epoch": 0.192,
      "grad_norm": 1.027754545211792,
      "learning_rate": 4.949575798648962e-06,
      "loss": 1.2775081634521483,
      "memory(GiB)": 37.33,
      "step": 20,
      "token_acc": 0.6861788617886179,
      "train_speed(iter/s)": 0.133544
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.9780960083007812,
      "learning_rate": 4.903627639769656e-06,
      "loss": 1.269430446624756,
      "memory(GiB)": 40.76,
      "step": 25,
      "token_acc": 0.6818361069484975,
      "train_speed(iter/s)": 0.134589
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.9856972694396973,
      "learning_rate": 4.8432948685969646e-06,
      "loss": 1.2507415771484376,
      "memory(GiB)": 40.77,
      "step": 30,
      "token_acc": 0.6933913731697665,
      "train_speed(iter/s)": 0.134694
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.9181770086288452,
      "learning_rate": 4.7689385491773934e-06,
      "loss": 1.2203140258789062,
      "memory(GiB)": 40.77,
      "step": 35,
      "token_acc": 0.6926996324757768,
      "train_speed(iter/s)": 0.135293
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.8927274346351624,
      "learning_rate": 4.681003670081015e-06,
      "loss": 1.165791893005371,
      "memory(GiB)": 40.77,
      "step": 40,
      "token_acc": 0.6992338742894801,
      "train_speed(iter/s)": 0.13613
    },
    {
      "epoch": 0.432,
      "grad_norm": 0.8748621344566345,
      "learning_rate": 4.580016481348367e-06,
      "loss": 1.156367015838623,
      "memory(GiB)": 44.42,
      "step": 45,
      "token_acc": 0.6986509140257045,
      "train_speed(iter/s)": 0.136251
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.8102411031723022,
      "learning_rate": 4.466581345124605e-06,
      "loss": 1.1229158401489259,
      "memory(GiB)": 44.42,
      "step": 50,
      "token_acc": 0.7167247090800986,
      "train_speed(iter/s)": 0.136659
    },
    {
      "epoch": 0.528,
      "grad_norm": 0.8838515281677246,
      "learning_rate": 4.341377118828415e-06,
      "loss": 1.107612419128418,
      "memory(GiB)": 48.14,
      "step": 55,
      "token_acc": 0.7120900797238432,
      "train_speed(iter/s)": 0.136689
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.8315885663032532,
      "learning_rate": 4.205153092500805e-06,
      "loss": 1.0878031730651856,
      "memory(GiB)": 48.14,
      "step": 60,
      "token_acc": 0.715117320437041,
      "train_speed(iter/s)": 0.13697
    },
    {
      "epoch": 0.624,
      "grad_norm": 0.8217139840126038,
      "learning_rate": 4.058724504646834e-06,
      "loss": 1.0752467155456542,
      "memory(GiB)": 48.14,
      "step": 65,
      "token_acc": 0.7134517984376342,
      "train_speed(iter/s)": 0.137088
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.8661554455757141,
      "learning_rate": 3.9029676634059565e-06,
      "loss": 1.0480419158935548,
      "memory(GiB)": 48.14,
      "step": 70,
      "token_acc": 0.720516548362268,
      "train_speed(iter/s)": 0.137508
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.7952479124069214,
      "learning_rate": 3.738814702248524e-06,
      "loss": 1.0351552963256836,
      "memory(GiB)": 48.14,
      "step": 75,
      "token_acc": 0.7191632811824704,
      "train_speed(iter/s)": 0.137541
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.7449638247489929,
      "learning_rate": 3.5672480015832117e-06,
      "loss": 1.005133056640625,
      "memory(GiB)": 52.23,
      "step": 80,
      "token_acc": 0.7209397177583816,
      "train_speed(iter/s)": 0.137691
    },
    {
      "epoch": 0.816,
      "grad_norm": 0.7283169627189636,
      "learning_rate": 3.3892943096594968e-06,
      "loss": 0.9723712921142578,
      "memory(GiB)": 52.23,
      "step": 85,
      "token_acc": 0.7297183690626313,
      "train_speed(iter/s)": 0.13788
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.7156486511230469,
      "learning_rate": 3.206018597948893e-06,
      "loss": 0.9777496337890625,
      "memory(GiB)": 52.23,
      "step": 90,
      "token_acc": 0.7360159362549801,
      "train_speed(iter/s)": 0.138013
    },
    {
      "epoch": 0.912,
      "grad_norm": 0.6714794635772705,
      "learning_rate": 3.018517687777688e-06,
      "loss": 0.9577500343322753,
      "memory(GiB)": 52.23,
      "step": 95,
      "token_acc": 0.7401526717557252,
      "train_speed(iter/s)": 0.138017
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.6295478940010071,
      "learning_rate": 2.827913686352856e-06,
      "loss": 0.9314702987670899,
      "memory(GiB)": 52.23,
      "step": 100,
      "token_acc": 0.7466477222271557,
      "train_speed(iter/s)": 0.138222
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6380690336227417,
      "learning_rate": 2.6353472714635443e-06,
      "loss": 0.9214505195617676,
      "memory(GiB)": 52.23,
      "step": 105,
      "token_acc": 0.7306868047759976,
      "train_speed(iter/s)": 0.135794
    },
    {
      "epoch": 1.048,
      "grad_norm": 0.6720516085624695,
      "learning_rate": 2.441970865046111e-06,
      "loss": 0.9319070816040039,
      "memory(GiB)": 52.23,
      "step": 110,
      "token_acc": 0.737565968335199,
      "train_speed(iter/s)": 0.13602
    },
    {
      "epoch": 1.096,
      "grad_norm": 0.7207827568054199,
      "learning_rate": 2.2489417364658194e-06,
      "loss": 0.9134335517883301,
      "memory(GiB)": 52.23,
      "step": 115,
      "token_acc": 0.7528385866109547,
      "train_speed(iter/s)": 0.136276
    },
    {
      "epoch": 1.144,
      "grad_norm": 0.6305469870567322,
      "learning_rate": 2.0574150767888795e-06,
      "loss": 0.9125921249389648,
      "memory(GiB)": 52.23,
      "step": 120,
      "token_acc": 0.7383805374001452,
      "train_speed(iter/s)": 0.136526
    },
    {
      "epoch": 1.192,
      "grad_norm": 0.6620354056358337,
      "learning_rate": 1.8685370854921631e-06,
      "loss": 0.9069900512695312,
      "memory(GiB)": 52.23,
      "step": 125,
      "token_acc": 0.7528492866820754,
      "train_speed(iter/s)": 0.136582
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.6394309401512146,
      "learning_rate": 1.6834381109834696e-06,
      "loss": 0.895142650604248,
      "memory(GiB)": 52.23,
      "step": 130,
      "token_acc": 0.7488820899035067,
      "train_speed(iter/s)": 0.136691
    },
    {
      "epoch": 1.288,
      "grad_norm": 0.5836830735206604,
      "learning_rate": 1.5032258859831916e-06,
      "loss": 0.8956113815307617,
      "memory(GiB)": 52.23,
      "step": 135,
      "token_acc": 0.7457598371777476,
      "train_speed(iter/s)": 0.13675
    },
    {
      "epoch": 1.336,
      "grad_norm": 0.5948861837387085,
      "learning_rate": 1.328978898250525e-06,
      "loss": 0.8733110427856445,
      "memory(GiB)": 52.23,
      "step": 140,
      "token_acc": 0.7442634138019278,
      "train_speed(iter/s)": 0.136942
    },
    {
      "epoch": 1.384,
      "grad_norm": 0.6282215118408203,
      "learning_rate": 1.1617399363274024e-06,
      "loss": 0.8916261672973633,
      "memory(GiB)": 52.23,
      "step": 145,
      "token_acc": 0.7513582342954159,
      "train_speed(iter/s)": 0.136986
    },
    {
      "epoch": 1.432,
      "grad_norm": 0.5960206389427185,
      "learning_rate": 1.0025098489259161e-06,
      "loss": 0.8885273933410645,
      "memory(GiB)": 52.23,
      "step": 150,
      "token_acc": 0.7522109400589584,
      "train_speed(iter/s)": 0.137046
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.5816349983215332,
      "learning_rate": 8.522415553064433e-07,
      "loss": 0.8631435394287109,
      "memory(GiB)": 52.23,
      "step": 155,
      "token_acc": 0.7504964171630838,
      "train_speed(iter/s)": 0.137186
    },
    {
      "epoch": 1.528,
      "grad_norm": 0.5882980823516846,
      "learning_rate": 7.118343424916249e-07,
      "loss": 0.8616400718688965,
      "memory(GiB)": 52.23,
      "step": 160,
      "token_acc": 0.7499206349206349,
      "train_speed(iter/s)": 0.137302
    },
    {
      "epoch": 1.576,
      "grad_norm": 0.6468539237976074,
      "learning_rate": 5.821284834447586e-07,
      "loss": 0.8487722396850585,
      "memory(GiB)": 52.23,
      "step": 165,
      "token_acc": 0.7601775198429632,
      "train_speed(iter/s)": 0.137471
    },
    {
      "epoch": 1.624,
      "grad_norm": 0.5706282258033752,
      "learning_rate": 4.6390020842035755e-07,
      "loss": 0.8451438903808594,
      "memory(GiB)": 52.23,
      "step": 170,
      "token_acc": 0.7554143126177024,
      "train_speed(iter/s)": 0.137542
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 0.6457207798957825,
      "learning_rate": 3.578570595810274e-07,
      "loss": 0.8587837219238281,
      "memory(GiB)": 52.23,
      "step": 175,
      "token_acc": 0.7626084715181717,
      "train_speed(iter/s)": 0.137627
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.5656464695930481,
      "learning_rate": 2.646336566811686e-07,
      "loss": 0.8507623672485352,
      "memory(GiB)": 52.23,
      "step": 180,
      "token_acc": 0.757103655837833,
      "train_speed(iter/s)": 0.137634
    },
    {
      "epoch": 1.768,
      "grad_norm": 0.5902859568595886,
      "learning_rate": 1.847878991579477e-07,
      "loss": 0.8512592315673828,
      "memory(GiB)": 56.56,
      "step": 185,
      "token_acc": 0.7558876383240329,
      "train_speed(iter/s)": 0.13778
    },
    {
      "epoch": 1.8159999999999998,
      "grad_norm": 0.5729983448982239,
      "learning_rate": 1.1879762735828081e-07,
      "loss": 0.8604719161987304,
      "memory(GiB)": 56.56,
      "step": 190,
      "token_acc": 0.7497445571013126,
      "train_speed(iter/s)": 0.137818
    },
    {
      "epoch": 1.8639999999999999,
      "grad_norm": 0.5367587208747864,
      "learning_rate": 6.705776288286281e-08,
      "loss": 0.8579318046569824,
      "memory(GiB)": 56.56,
      "step": 195,
      "token_acc": 0.7590520658972778,
      "train_speed(iter/s)": 0.137879
    },
    {
      "epoch": 1.912,
      "grad_norm": 0.5742630362510681,
      "learning_rate": 2.987794516097875e-08,
      "loss": 0.866505241394043,
      "memory(GiB)": 56.56,
      "step": 200,
      "token_acc": 0.7586411889596603,
      "train_speed(iter/s)": 0.137909
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.5835118293762207,
      "learning_rate": 7.480678400109965e-09,
      "loss": 0.8637813568115235,
      "memory(GiB)": 56.56,
      "step": 205,
      "token_acc": 0.7485916626428456,
      "train_speed(iter/s)": 0.136252
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6009649634361267,
      "learning_rate": 0.0,
      "loss": 0.8247391700744628,
      "memory(GiB)": 61.37,
      "step": 210,
      "token_acc": 0.770509227944729,
      "train_speed(iter/s)": 0.136863
    }
  ],
  "logging_steps": 5,
  "max_steps": 210,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.529829855770706e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}