{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 54.99601593625498,
  "eval_steps": 500,
  "global_step": 3451,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.9880478087649402,
      "grad_norm": 0.2889673113822937,
      "learning_rate": 0.0002974798061389337,
      "loss": 0.7635,
      "step": 62
    },
    {
      "epoch": 1.9920318725099602,
      "grad_norm": 0.34704622626304626,
      "learning_rate": 0.0002944264943457189,
      "loss": 0.5643,
      "step": 125
    },
    {
      "epoch": 2.99601593625498,
      "grad_norm": 0.32337236404418945,
      "learning_rate": 0.00029137318255250403,
      "loss": 0.47,
      "step": 188
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.5398977994918823,
      "learning_rate": 0.00028831987075928915,
      "loss": 0.3844,
      "step": 251
    },
    {
      "epoch": 4.98804780876494,
      "grad_norm": 0.4496667981147766,
      "learning_rate": 0.0002853150242326333,
      "loss": 0.3176,
      "step": 313
    },
    {
      "epoch": 5.99203187250996,
      "grad_norm": 0.4373694062232971,
      "learning_rate": 0.0002822617124394184,
      "loss": 0.2479,
      "step": 376
    },
    {
      "epoch": 6.99601593625498,
      "grad_norm": 0.5146955251693726,
      "learning_rate": 0.0002792084006462035,
      "loss": 0.1915,
      "step": 439
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.5761932134628296,
      "learning_rate": 0.00027615508885298865,
      "loss": 0.1465,
      "step": 502
    },
    {
      "epoch": 8.98804780876494,
      "grad_norm": 0.46838995814323425,
      "learning_rate": 0.00027315024232633277,
      "loss": 0.1184,
      "step": 564
    },
    {
      "epoch": 9.99203187250996,
      "grad_norm": 0.39898747205734253,
      "learning_rate": 0.0002700969305331179,
      "loss": 0.0931,
      "step": 627
    },
    {
      "epoch": 10.996015936254981,
      "grad_norm": 0.4515402615070343,
      "learning_rate": 0.000267043618739903,
      "loss": 0.0776,
      "step": 690
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.2392401546239853,
      "learning_rate": 0.0002639903069466882,
      "loss": 0.0685,
      "step": 753
    },
    {
      "epoch": 12.98804780876494,
      "grad_norm": 0.3774527609348297,
      "learning_rate": 0.00026098546042003227,
      "loss": 0.0622,
      "step": 815
    },
    {
      "epoch": 13.99203187250996,
      "grad_norm": 0.22206874191761017,
      "learning_rate": 0.0002579321486268174,
      "loss": 0.0534,
      "step": 878
    },
    {
      "epoch": 14.996015936254981,
      "grad_norm": 0.25606569647789,
      "learning_rate": 0.00025487883683360257,
      "loss": 0.0504,
      "step": 941
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.3752060830593109,
      "learning_rate": 0.0002518255250403877,
      "loss": 0.0469,
      "step": 1004
    },
    {
      "epoch": 16.98804780876494,
      "grad_norm": 0.3305172920227051,
      "learning_rate": 0.0002488206785137318,
      "loss": 0.0445,
      "step": 1066
    },
    {
      "epoch": 17.992031872509962,
      "grad_norm": 0.2901972234249115,
      "learning_rate": 0.00024576736672051694,
      "loss": 0.0418,
      "step": 1129
    },
    {
      "epoch": 18.99601593625498,
      "grad_norm": 0.3044564127922058,
      "learning_rate": 0.0002427140549273021,
      "loss": 0.0425,
      "step": 1192
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.2603231966495514,
      "learning_rate": 0.00023966074313408722,
      "loss": 0.0392,
      "step": 1255
    },
    {
      "epoch": 20.98804780876494,
      "grad_norm": 0.2220539152622223,
      "learning_rate": 0.00023665589660743132,
      "loss": 0.0371,
      "step": 1317
    },
    {
      "epoch": 21.992031872509962,
      "grad_norm": 0.20209918916225433,
      "learning_rate": 0.00023360258481421647,
      "loss": 0.0335,
      "step": 1380
    },
    {
      "epoch": 22.99601593625498,
      "grad_norm": 0.271361380815506,
      "learning_rate": 0.0002305492730210016,
      "loss": 0.034,
      "step": 1443
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.29796579480171204,
      "learning_rate": 0.00022749596122778674,
      "loss": 0.0334,
      "step": 1506
    },
    {
      "epoch": 24.98804780876494,
      "grad_norm": 0.2501268684864044,
      "learning_rate": 0.00022449111470113084,
      "loss": 0.0341,
      "step": 1568
    },
    {
      "epoch": 25.992031872509962,
      "grad_norm": 0.24419628083705902,
      "learning_rate": 0.00022143780290791596,
      "loss": 0.0339,
      "step": 1631
    },
    {
      "epoch": 26.99601593625498,
      "grad_norm": 0.2808271050453186,
      "learning_rate": 0.00021838449111470111,
      "loss": 0.0342,
      "step": 1694
    },
    {
      "epoch": 28.0,
      "grad_norm": 0.2101186364889145,
      "learning_rate": 0.00021533117932148624,
      "loss": 0.0349,
      "step": 1757
    },
    {
      "epoch": 28.98804780876494,
      "grad_norm": 0.3779149353504181,
      "learning_rate": 0.00021232633279483034,
      "loss": 0.0338,
      "step": 1819
    },
    {
      "epoch": 29.992031872509962,
      "grad_norm": 0.21218642592430115,
      "learning_rate": 0.00020927302100161549,
      "loss": 0.0324,
      "step": 1882
    },
    {
      "epoch": 30.99601593625498,
      "grad_norm": 0.19775445759296417,
      "learning_rate": 0.0002062197092084006,
      "loss": 0.0301,
      "step": 1945
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.3647628724575043,
      "learning_rate": 0.00020316639741518576,
      "loss": 0.0292,
      "step": 2008
    },
    {
      "epoch": 32.98804780876494,
      "grad_norm": 0.146266907453537,
      "learning_rate": 0.00020016155088852986,
      "loss": 0.0296,
      "step": 2070
    },
    {
      "epoch": 33.99203187250996,
      "grad_norm": 0.17022199928760529,
      "learning_rate": 0.00019710823909531498,
      "loss": 0.0269,
      "step": 2133
    },
    {
      "epoch": 34.99601593625498,
      "grad_norm": 0.12986156344413757,
      "learning_rate": 0.00019405492730210016,
      "loss": 0.0257,
      "step": 2196
    },
    {
      "epoch": 36.0,
      "grad_norm": 0.10818672925233841,
      "learning_rate": 0.00019100161550888528,
      "loss": 0.0262,
      "step": 2259
    },
    {
      "epoch": 36.98804780876494,
      "grad_norm": 0.33398228883743286,
      "learning_rate": 0.0001879967689822294,
      "loss": 0.0263,
      "step": 2321
    },
    {
      "epoch": 37.99203187250996,
      "grad_norm": 0.1804308295249939,
      "learning_rate": 0.00018494345718901453,
      "loss": 0.0259,
      "step": 2384
    },
    {
      "epoch": 38.99601593625498,
      "grad_norm": 0.17181633412837982,
      "learning_rate": 0.00018189014539579966,
      "loss": 0.0246,
      "step": 2447
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.18690842390060425,
      "learning_rate": 0.0001788368336025848,
      "loss": 0.0237,
      "step": 2510
    },
    {
      "epoch": 40.98804780876494,
      "grad_norm": 0.11291439831256866,
      "learning_rate": 0.0001758319870759289,
      "loss": 0.0241,
      "step": 2572
    },
    {
      "epoch": 41.99203187250996,
      "grad_norm": 0.14739790558815002,
      "learning_rate": 0.00017277867528271403,
      "loss": 0.0237,
      "step": 2635
    },
    {
      "epoch": 42.99601593625498,
      "grad_norm": 0.14795717597007751,
      "learning_rate": 0.00016972536348949918,
      "loss": 0.0222,
      "step": 2698
    },
    {
      "epoch": 44.0,
      "grad_norm": 0.04227016866207123,
      "learning_rate": 0.0001666720516962843,
      "loss": 0.0229,
      "step": 2761
    },
    {
      "epoch": 44.98804780876494,
      "grad_norm": 0.09970798343420029,
      "learning_rate": 0.0001636672051696284,
      "loss": 0.0225,
      "step": 2823
    },
    {
      "epoch": 45.99203187250996,
      "grad_norm": 0.05245927348732948,
      "learning_rate": 0.00016061389337641355,
      "loss": 0.0224,
      "step": 2886
    },
    {
      "epoch": 46.99601593625498,
      "grad_norm": 0.11409150063991547,
      "learning_rate": 0.00015756058158319868,
      "loss": 0.0234,
      "step": 2949
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.20031851530075073,
      "learning_rate": 0.00015450726978998383,
      "loss": 0.0236,
      "step": 3012
    },
    {
      "epoch": 48.98804780876494,
      "grad_norm": 0.0484083816409111,
      "learning_rate": 0.00015150242326332792,
      "loss": 0.0227,
      "step": 3074
    },
    {
      "epoch": 49.99203187250996,
      "grad_norm": 0.04373839870095253,
      "learning_rate": 0.00014844911147011308,
      "loss": 0.0221,
      "step": 3137
    },
    {
      "epoch": 50.99601593625498,
      "grad_norm": 0.04523545131087303,
      "learning_rate": 0.0001453957996768982,
      "loss": 0.0208,
      "step": 3200
    },
    {
      "epoch": 52.0,
      "grad_norm": 0.050686538219451904,
      "learning_rate": 0.00014234248788368335,
      "loss": 0.0211,
      "step": 3263
    },
    {
      "epoch": 52.98804780876494,
      "grad_norm": 0.10163258761167526,
      "learning_rate": 0.00013933764135702745,
      "loss": 0.0212,
      "step": 3325
    },
    {
      "epoch": 53.99203187250996,
      "grad_norm": 0.04123036935925484,
      "learning_rate": 0.00013628432956381257,
      "loss": 0.0212,
      "step": 3388
    },
    {
      "epoch": 54.99601593625498,
      "grad_norm": 0.2797642946243286,
      "learning_rate": 0.00013323101777059772,
      "loss": 0.0217,
      "step": 3451
    }
  ],
  "logging_steps": 1,
  "max_steps": 6200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.791394932396073e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}