{ "best_global_step": 25500, "best_metric": 0.21131116151809692, "best_model_checkpoint": "reverse_model/checkpoint-25500", "epoch": 3.7155762785953663, "eval_steps": 500, "global_step": 25500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07285443683520326, "grad_norm": 0.7519411444664001, "learning_rate": 0.00019818228180096167, "loss": 0.974, "step": 500 }, { "epoch": 0.07285443683520326, "eval_loss": 0.6119500398635864, "eval_runtime": 0.5563, "eval_samples_per_second": 179.775, "eval_steps_per_second": 23.371, "step": 500 }, { "epoch": 0.14570887367040652, "grad_norm": 0.8617602586746216, "learning_rate": 0.0001963609208800816, "loss": 0.6234, "step": 1000 }, { "epoch": 0.14570887367040652, "eval_loss": 0.5182287096977234, "eval_runtime": 0.5766, "eval_samples_per_second": 173.435, "eval_steps_per_second": 22.546, "step": 1000 }, { "epoch": 0.2185633105056098, "grad_norm": 0.4657430648803711, "learning_rate": 0.0001945395599592015, "loss": 0.5373, "step": 1500 }, { "epoch": 0.2185633105056098, "eval_loss": 0.4636226296424866, "eval_runtime": 0.5903, "eval_samples_per_second": 169.403, "eval_steps_per_second": 22.022, "step": 1500 }, { "epoch": 0.29141774734081305, "grad_norm": 0.5456737875938416, "learning_rate": 0.00019271819903832145, "loss": 0.4952, "step": 2000 }, { "epoch": 0.29141774734081305, "eval_loss": 0.4364851415157318, "eval_runtime": 0.5613, "eval_samples_per_second": 178.171, "eval_steps_per_second": 23.162, "step": 2000 }, { "epoch": 0.3642721841760163, "grad_norm": 0.8705533146858215, "learning_rate": 0.00019089683811744136, "loss": 0.4634, "step": 2500 }, { "epoch": 0.3642721841760163, "eval_loss": 0.4068869650363922, "eval_runtime": 0.5819, "eval_samples_per_second": 171.863, "eval_steps_per_second": 22.342, "step": 2500 }, { "epoch": 0.4371266210112196, "grad_norm": 0.45825353264808655, "learning_rate": 0.0001890754771965613, "loss": 0.4429, "step": 3000 }, { "epoch": 0.4371266210112196, "eval_loss": 0.395874947309494, "eval_runtime": 0.5825, "eval_samples_per_second": 171.678, "eval_steps_per_second": 22.318, "step": 3000 }, { "epoch": 0.5099810578464229, "grad_norm": 0.5927444100379944, "learning_rate": 0.0001872541162756812, "loss": 0.4223, "step": 3500 }, { "epoch": 0.5099810578464229, "eval_loss": 0.36890122294425964, "eval_runtime": 0.5584, "eval_samples_per_second": 179.096, "eval_steps_per_second": 23.282, "step": 3500 }, { "epoch": 0.5828354946816261, "grad_norm": 0.6521668434143066, "learning_rate": 0.0001854327553548011, "loss": 0.4066, "step": 4000 }, { "epoch": 0.5828354946816261, "eval_loss": 0.3580659031867981, "eval_runtime": 0.5849, "eval_samples_per_second": 170.956, "eval_steps_per_second": 22.224, "step": 4000 }, { "epoch": 0.6556899315168294, "grad_norm": 0.5500112175941467, "learning_rate": 0.00018361139443392105, "loss": 0.3909, "step": 4500 }, { "epoch": 0.6556899315168294, "eval_loss": 0.3474609851837158, "eval_runtime": 0.569, "eval_samples_per_second": 175.738, "eval_steps_per_second": 22.846, "step": 4500 }, { "epoch": 0.7285443683520326, "grad_norm": 0.703709065914154, "learning_rate": 0.00018179003351304095, "loss": 0.3806, "step": 5000 }, { "epoch": 0.7285443683520326, "eval_loss": 0.33880433440208435, "eval_runtime": 0.5577, "eval_samples_per_second": 179.313, "eval_steps_per_second": 23.311, "step": 5000 }, { "epoch": 0.8013988051872359, "grad_norm": 0.568647027015686, "learning_rate": 0.0001799686725921609, "loss": 0.3731, "step": 5500 }, { "epoch": 0.8013988051872359, "eval_loss": 0.33678069710731506, "eval_runtime": 0.5788, "eval_samples_per_second": 172.771, "eval_steps_per_second": 22.46, "step": 5500 }, { "epoch": 0.8742532420224391, "grad_norm": 0.6542627811431885, "learning_rate": 0.00017814731167128077, "loss": 0.3597, "step": 6000 }, { "epoch": 0.8742532420224391, "eval_loss": 0.3265901207923889, "eval_runtime": 0.5582, "eval_samples_per_second": 179.158, "eval_steps_per_second": 23.291, "step": 6000 }, { "epoch": 0.9471076788576425, "grad_norm": 0.6129189729690552, "learning_rate": 0.0001763259507504007, "loss": 0.3555, "step": 6500 }, { "epoch": 0.9471076788576425, "eval_loss": 0.3173937499523163, "eval_runtime": 0.5933, "eval_samples_per_second": 168.543, "eval_steps_per_second": 21.911, "step": 6500 }, { "epoch": 1.0199621156928458, "grad_norm": 0.46865177154541016, "learning_rate": 0.00017450458982952062, "loss": 0.3471, "step": 7000 }, { "epoch": 1.0199621156928458, "eval_loss": 0.3176809549331665, "eval_runtime": 0.5602, "eval_samples_per_second": 178.494, "eval_steps_per_second": 23.204, "step": 7000 }, { "epoch": 1.0928165525280489, "grad_norm": 0.5402314066886902, "learning_rate": 0.00017268322890864055, "loss": 0.3377, "step": 7500 }, { "epoch": 1.0928165525280489, "eval_loss": 0.30362746119499207, "eval_runtime": 0.5623, "eval_samples_per_second": 177.838, "eval_steps_per_second": 23.119, "step": 7500 }, { "epoch": 1.1656709893632522, "grad_norm": 0.4397026300430298, "learning_rate": 0.00017086186798776046, "loss": 0.3327, "step": 8000 }, { "epoch": 1.1656709893632522, "eval_loss": 0.2984870970249176, "eval_runtime": 0.5634, "eval_samples_per_second": 177.483, "eval_steps_per_second": 23.073, "step": 8000 }, { "epoch": 1.2385254261984555, "grad_norm": 0.4877306818962097, "learning_rate": 0.00016904050706688037, "loss": 0.3257, "step": 8500 }, { "epoch": 1.2385254261984555, "eval_loss": 0.29171615839004517, "eval_runtime": 0.579, "eval_samples_per_second": 172.707, "eval_steps_per_second": 22.452, "step": 8500 }, { "epoch": 1.3113798630336588, "grad_norm": 1.0982270240783691, "learning_rate": 0.0001672191461460003, "loss": 0.32, "step": 9000 }, { "epoch": 1.3113798630336588, "eval_loss": 0.2923184633255005, "eval_runtime": 0.558, "eval_samples_per_second": 179.22, "eval_steps_per_second": 23.299, "step": 9000 }, { "epoch": 1.384234299868862, "grad_norm": 0.6584481000900269, "learning_rate": 0.0001653977852251202, "loss": 0.3129, "step": 9500 }, { "epoch": 1.384234299868862, "eval_loss": 0.2869529128074646, "eval_runtime": 0.5651, "eval_samples_per_second": 176.96, "eval_steps_per_second": 23.005, "step": 9500 }, { "epoch": 1.4570887367040652, "grad_norm": 0.5571127533912659, "learning_rate": 0.00016357642430424015, "loss": 0.3178, "step": 10000 }, { "epoch": 1.4570887367040652, "eval_loss": 0.2796230614185333, "eval_runtime": 0.5645, "eval_samples_per_second": 177.159, "eval_steps_per_second": 23.031, "step": 10000 }, { "epoch": 1.5299431735392686, "grad_norm": 0.33073556423187256, "learning_rate": 0.00016175506338336006, "loss": 0.3054, "step": 10500 }, { "epoch": 1.5299431735392686, "eval_loss": 0.27576252818107605, "eval_runtime": 0.5776, "eval_samples_per_second": 173.121, "eval_steps_per_second": 22.506, "step": 10500 }, { "epoch": 1.6027976103744717, "grad_norm": 0.45740246772766113, "learning_rate": 0.00015993370246247996, "loss": 0.307, "step": 11000 }, { "epoch": 1.6027976103744717, "eval_loss": 0.27131548523902893, "eval_runtime": 0.5666, "eval_samples_per_second": 176.496, "eval_steps_per_second": 22.945, "step": 11000 }, { "epoch": 1.6756520472096752, "grad_norm": 0.45748448371887207, "learning_rate": 0.00015811234154159987, "loss": 0.3015, "step": 11500 }, { "epoch": 1.6756520472096752, "eval_loss": 0.26808932423591614, "eval_runtime": 0.5676, "eval_samples_per_second": 176.195, "eval_steps_per_second": 22.905, "step": 11500 }, { "epoch": 1.7485064840448783, "grad_norm": 0.4469503164291382, "learning_rate": 0.0001562909806207198, "loss": 0.301, "step": 12000 }, { "epoch": 1.7485064840448783, "eval_loss": 0.2650498151779175, "eval_runtime": 0.5572, "eval_samples_per_second": 179.466, "eval_steps_per_second": 23.331, "step": 12000 }, { "epoch": 1.8213609208800816, "grad_norm": 0.6140857338905334, "learning_rate": 0.00015446961969983972, "loss": 0.2948, "step": 12500 }, { "epoch": 1.8213609208800816, "eval_loss": 0.2665635645389557, "eval_runtime": 0.5602, "eval_samples_per_second": 178.493, "eval_steps_per_second": 23.204, "step": 12500 }, { "epoch": 1.894215357715285, "grad_norm": 0.4431038200855255, "learning_rate": 0.00015264825877895965, "loss": 0.2929, "step": 13000 }, { "epoch": 1.894215357715285, "eval_loss": 0.26208823919296265, "eval_runtime": 0.562, "eval_samples_per_second": 177.925, "eval_steps_per_second": 23.13, "step": 13000 }, { "epoch": 1.967069794550488, "grad_norm": 0.5034199953079224, "learning_rate": 0.0001508268978580796, "loss": 0.2844, "step": 13500 }, { "epoch": 1.967069794550488, "eval_loss": 0.2574382722377777, "eval_runtime": 0.5819, "eval_samples_per_second": 171.837, "eval_steps_per_second": 22.339, "step": 13500 }, { "epoch": 2.0399242313856916, "grad_norm": 0.5565065741539001, "learning_rate": 0.00014900553693719947, "loss": 0.2807, "step": 14000 }, { "epoch": 2.0399242313856916, "eval_loss": 0.25828686356544495, "eval_runtime": 0.5827, "eval_samples_per_second": 171.603, "eval_steps_per_second": 22.308, "step": 14000 }, { "epoch": 2.1127786682208947, "grad_norm": 0.5862753987312317, "learning_rate": 0.0001471841760163194, "loss": 0.2806, "step": 14500 }, { "epoch": 2.1127786682208947, "eval_loss": 0.2531309425830841, "eval_runtime": 0.5905, "eval_samples_per_second": 169.362, "eval_steps_per_second": 22.017, "step": 14500 }, { "epoch": 2.1856331050560978, "grad_norm": 0.7702651619911194, "learning_rate": 0.0001453628150954393, "loss": 0.2776, "step": 15000 }, { "epoch": 2.1856331050560978, "eval_loss": 0.24827995896339417, "eval_runtime": 0.5626, "eval_samples_per_second": 177.742, "eval_steps_per_second": 23.106, "step": 15000 }, { "epoch": 2.2584875418913013, "grad_norm": 0.4496975541114807, "learning_rate": 0.00014354145417455925, "loss": 0.2729, "step": 15500 }, { "epoch": 2.2584875418913013, "eval_loss": 0.24621199071407318, "eval_runtime": 0.5772, "eval_samples_per_second": 173.237, "eval_steps_per_second": 22.521, "step": 15500 }, { "epoch": 2.3313419787265044, "grad_norm": 0.5896193981170654, "learning_rate": 0.00014172009325367916, "loss": 0.2718, "step": 16000 }, { "epoch": 2.3313419787265044, "eval_loss": 0.24523521959781647, "eval_runtime": 0.5819, "eval_samples_per_second": 171.859, "eval_steps_per_second": 22.342, "step": 16000 }, { "epoch": 2.4041964155617075, "grad_norm": 0.411600798368454, "learning_rate": 0.00013989873233279907, "loss": 0.2718, "step": 16500 }, { "epoch": 2.4041964155617075, "eval_loss": 0.24163128435611725, "eval_runtime": 0.5597, "eval_samples_per_second": 178.657, "eval_steps_per_second": 23.225, "step": 16500 }, { "epoch": 2.477050852396911, "grad_norm": 0.5009840130805969, "learning_rate": 0.00013807737141191897, "loss": 0.2652, "step": 17000 }, { "epoch": 2.477050852396911, "eval_loss": 0.24147550761699677, "eval_runtime": 0.5809, "eval_samples_per_second": 172.154, "eval_steps_per_second": 22.38, "step": 17000 }, { "epoch": 2.549905289232114, "grad_norm": 0.5353007912635803, "learning_rate": 0.0001362560104910389, "loss": 0.2628, "step": 17500 }, { "epoch": 2.549905289232114, "eval_loss": 0.2369464635848999, "eval_runtime": 0.5912, "eval_samples_per_second": 169.158, "eval_steps_per_second": 21.991, "step": 17500 }, { "epoch": 2.6227597260673177, "grad_norm": 0.4573606848716736, "learning_rate": 0.00013443464957015885, "loss": 0.2609, "step": 18000 }, { "epoch": 2.6227597260673177, "eval_loss": 0.23909151554107666, "eval_runtime": 0.5596, "eval_samples_per_second": 178.696, "eval_steps_per_second": 23.23, "step": 18000 }, { "epoch": 2.6956141629025208, "grad_norm": 0.4674642086029053, "learning_rate": 0.00013261328864927875, "loss": 0.257, "step": 18500 }, { "epoch": 2.6956141629025208, "eval_loss": 0.2337809056043625, "eval_runtime": 0.5605, "eval_samples_per_second": 178.402, "eval_steps_per_second": 23.192, "step": 18500 }, { "epoch": 2.768468599737724, "grad_norm": 0.43507474660873413, "learning_rate": 0.00013079192772839866, "loss": 0.2566, "step": 19000 }, { "epoch": 2.768468599737724, "eval_loss": 0.2308950424194336, "eval_runtime": 0.561, "eval_samples_per_second": 178.259, "eval_steps_per_second": 23.174, "step": 19000 }, { "epoch": 2.8413230365729274, "grad_norm": 0.4606495797634125, "learning_rate": 0.00012897056680751857, "loss": 0.2607, "step": 19500 }, { "epoch": 2.8413230365729274, "eval_loss": 0.23102878034114838, "eval_runtime": 0.5745, "eval_samples_per_second": 174.057, "eval_steps_per_second": 22.627, "step": 19500 }, { "epoch": 2.9141774734081305, "grad_norm": 0.686039388179779, "learning_rate": 0.0001271492058866385, "loss": 0.2562, "step": 20000 }, { "epoch": 2.9141774734081305, "eval_loss": 0.22536581754684448, "eval_runtime": 0.5619, "eval_samples_per_second": 177.974, "eval_steps_per_second": 23.137, "step": 20000 }, { "epoch": 2.987031910243334, "grad_norm": 0.48106732964515686, "learning_rate": 0.00012532784496575841, "loss": 0.2524, "step": 20500 }, { "epoch": 2.987031910243334, "eval_loss": 0.22328069806098938, "eval_runtime": 0.562, "eval_samples_per_second": 177.946, "eval_steps_per_second": 23.133, "step": 20500 }, { "epoch": 3.059886347078537, "grad_norm": 0.44265252351760864, "learning_rate": 0.00012350648404487835, "loss": 0.2469, "step": 21000 }, { "epoch": 3.059886347078537, "eval_loss": 0.22362683713436127, "eval_runtime": 0.5835, "eval_samples_per_second": 171.391, "eval_steps_per_second": 22.281, "step": 21000 }, { "epoch": 3.1327407839137402, "grad_norm": 0.4698319435119629, "learning_rate": 0.00012168512312399827, "loss": 0.2521, "step": 21500 }, { "epoch": 3.1327407839137402, "eval_loss": 0.22086407244205475, "eval_runtime": 0.5707, "eval_samples_per_second": 175.226, "eval_steps_per_second": 22.779, "step": 21500 }, { "epoch": 3.2055952207489438, "grad_norm": 0.4953310191631317, "learning_rate": 0.00011986376220311817, "loss": 0.2478, "step": 22000 }, { "epoch": 3.2055952207489438, "eval_loss": 0.22542959451675415, "eval_runtime": 0.5593, "eval_samples_per_second": 178.801, "eval_steps_per_second": 23.244, "step": 22000 }, { "epoch": 3.278449657584147, "grad_norm": 0.921518087387085, "learning_rate": 0.00011804240128223809, "loss": 0.2447, "step": 22500 }, { "epoch": 3.278449657584147, "eval_loss": 0.22003282606601715, "eval_runtime": 0.5616, "eval_samples_per_second": 178.07, "eval_steps_per_second": 23.149, "step": 22500 }, { "epoch": 3.35130409441935, "grad_norm": 0.5857972502708435, "learning_rate": 0.00011622104036135801, "loss": 0.2432, "step": 23000 }, { "epoch": 3.35130409441935, "eval_loss": 0.21950890123844147, "eval_runtime": 0.5587, "eval_samples_per_second": 178.975, "eval_steps_per_second": 23.267, "step": 23000 }, { "epoch": 3.4241585312545535, "grad_norm": 0.5046322345733643, "learning_rate": 0.00011439967944047793, "loss": 0.2394, "step": 23500 }, { "epoch": 3.4241585312545535, "eval_loss": 0.21759502589702606, "eval_runtime": 0.5757, "eval_samples_per_second": 173.711, "eval_steps_per_second": 22.582, "step": 23500 }, { "epoch": 3.4970129680897566, "grad_norm": 0.6441205739974976, "learning_rate": 0.00011257831851959786, "loss": 0.2371, "step": 24000 }, { "epoch": 3.4970129680897566, "eval_loss": 0.21436944603919983, "eval_runtime": 0.5779, "eval_samples_per_second": 173.027, "eval_steps_per_second": 22.494, "step": 24000 }, { "epoch": 3.5698674049249597, "grad_norm": 0.5626524090766907, "learning_rate": 0.00011075695759871776, "loss": 0.2391, "step": 24500 }, { "epoch": 3.5698674049249597, "eval_loss": 0.21467125415802002, "eval_runtime": 0.5829, "eval_samples_per_second": 171.542, "eval_steps_per_second": 22.3, "step": 24500 }, { "epoch": 3.6427218417601632, "grad_norm": 0.6218989491462708, "learning_rate": 0.00010893559667783769, "loss": 0.235, "step": 25000 }, { "epoch": 3.6427218417601632, "eval_loss": 0.21365521848201752, "eval_runtime": 0.5617, "eval_samples_per_second": 178.033, "eval_steps_per_second": 23.144, "step": 25000 }, { "epoch": 3.7155762785953663, "grad_norm": 0.39173460006713867, "learning_rate": 0.00010711423575695761, "loss": 0.2364, "step": 25500 }, { "epoch": 3.7155762785953663, "eval_loss": 0.21131116151809692, "eval_runtime": 0.5528, "eval_samples_per_second": 180.905, "eval_steps_per_second": 23.518, "step": 25500 } ], "logging_steps": 500, "max_steps": 54904, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6902431875072000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }