| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 93, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 20.59375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1623296411707997, | |
| "epoch": 0.010752688172043012, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 2.5920114517211914, | |
| "learning_rate": 0.0, | |
| "loss": -0.0655, | |
| "num_tokens": 209228.0, | |
| "reward": 0.10260416567325592, | |
| "reward_std": 0.12079741060733795, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.01875000074505806, | |
| "rewards/reward_coverage/std": 0.08886408805847168, | |
| "rewards/reward_repetition/mean": 0.08385416865348816, | |
| "rewards/reward_repetition/std": 0.1609596610069275, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5810753107070923, | |
| "sampling/importance_sampling_ratio/min": 8.851574766937428e-15, | |
| "sampling/sampling_logp_difference/max": 32.35818099975586, | |
| "sampling/sampling_logp_difference/mean": 3.2135589122772217, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 19.34375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.2037830371409655, | |
| "epoch": 0.021505376344086023, | |
| "frac_reward_zero_std": 0.65625, | |
| "grad_norm": 1.522868275642395, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "loss": -0.0672, | |
| "num_tokens": 411382.0, | |
| "reward": 0.06171875074505806, | |
| "reward_std": 0.08728349208831787, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0015625000232830644, | |
| "rewards/reward_coverage/std": 0.01250000111758709, | |
| "rewards/reward_repetition/mean": 0.06015624850988388, | |
| "rewards/reward_repetition/std": 0.14286737143993378, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.572687029838562, | |
| "sampling/importance_sampling_ratio/min": 6.514152938275704e-16, | |
| "sampling/sampling_logp_difference/max": 34.967384338378906, | |
| "sampling/sampling_logp_difference/mean": 3.2888572216033936, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 18.015625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1908840090036392, | |
| "epoch": 0.03225806451612903, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 2.1042428016662598, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": -0.0415, | |
| "num_tokens": 589899.0, | |
| "reward": 0.03333333134651184, | |
| "reward_std": 0.03609190881252289, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0, | |
| "rewards/reward_coverage/std": 0.0, | |
| "rewards/reward_repetition/mean": 0.03333333134651184, | |
| "rewards/reward_repetition/std": 0.09172075986862183, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.582237184047699, | |
| "sampling/importance_sampling_ratio/min": 3.4790040246974845e-15, | |
| "sampling/sampling_logp_difference/max": 33.292030334472656, | |
| "sampling/sampling_logp_difference/mean": 3.1163713932037354, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 18.1875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.188672631047666, | |
| "epoch": 0.043010752688172046, | |
| "frac_reward_zero_std": 0.78125, | |
| "grad_norm": 1.6742660999298096, | |
| "learning_rate": 7.5e-07, | |
| "loss": -0.0256, | |
| "num_tokens": 784993.0, | |
| "reward": 0.03932292014360428, | |
| "reward_std": 0.046772170811891556, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0, | |
| "rewards/reward_coverage/std": 0.0, | |
| "rewards/reward_repetition/mean": 0.03932292014360428, | |
| "rewards/reward_repetition/std": 0.10880006849765778, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5760836005210876, | |
| "sampling/importance_sampling_ratio/min": 4.4242851705367583e-14, | |
| "sampling/sampling_logp_difference/max": 30.749082565307617, | |
| "sampling/sampling_logp_difference/mean": 3.1024224758148193, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 19.296875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1812530495226383, | |
| "epoch": 0.053763440860215055, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 3.190187931060791, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": -0.061, | |
| "num_tokens": 994168.0, | |
| "reward": 0.07604166865348816, | |
| "reward_std": 0.10753916203975677, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0062500000931322575, | |
| "rewards/reward_coverage/std": 0.035073623061180115, | |
| "rewards/reward_repetition/mean": 0.06979166716337204, | |
| "rewards/reward_repetition/std": 0.1535550206899643, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5558198690414429, | |
| "sampling/importance_sampling_ratio/min": 1.0327031483188718e-17, | |
| "sampling/sampling_logp_difference/max": 39.11176681518555, | |
| "sampling/sampling_logp_difference/mean": 3.2322075366973877, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 18.40625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1822001421824098, | |
| "epoch": 0.06451612903225806, | |
| "frac_reward_zero_std": 0.71875, | |
| "grad_norm": 2.3724122047424316, | |
| "learning_rate": 1.25e-06, | |
| "loss": -0.0569, | |
| "num_tokens": 1177412.0, | |
| "reward": 0.04401041567325592, | |
| "reward_std": 0.062240131199359894, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0, | |
| "rewards/reward_coverage/std": 0.0, | |
| "rewards/reward_repetition/mean": 0.04401041567325592, | |
| "rewards/reward_repetition/std": 0.11309215426445007, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5901535749435425, | |
| "sampling/importance_sampling_ratio/min": 2.3622067538522137e-15, | |
| "sampling/sampling_logp_difference/max": 33.67918014526367, | |
| "sampling/sampling_logp_difference/mean": 3.0934648513793945, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 20.203125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1859856164082885, | |
| "epoch": 0.07526881720430108, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 2.2745258808135986, | |
| "learning_rate": 1.5e-06, | |
| "loss": -0.07, | |
| "num_tokens": 1379577.0, | |
| "reward": 0.08828125149011612, | |
| "reward_std": 0.10496117174625397, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.02031249925494194, | |
| "rewards/reward_coverage/std": 0.08578246831893921, | |
| "rewards/reward_repetition/mean": 0.06796875596046448, | |
| "rewards/reward_repetition/std": 0.13983187079429626, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5829952359199524, | |
| "sampling/importance_sampling_ratio/min": 1.7831107242271977e-19, | |
| "sampling/sampling_logp_difference/max": 43.17075729370117, | |
| "sampling/sampling_logp_difference/mean": 3.094505786895752, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 19.765625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.1645366102457047, | |
| "epoch": 0.08602150537634409, | |
| "frac_reward_zero_std": 0.71875, | |
| "grad_norm": 2.285911798477173, | |
| "learning_rate": 1.75e-06, | |
| "loss": -0.0463, | |
| "num_tokens": 1562860.0, | |
| "reward": 0.06822916865348816, | |
| "reward_std": 0.05671586096286774, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0, | |
| "rewards/reward_coverage/std": 0.0, | |
| "rewards/reward_repetition/mean": 0.06822916865348816, | |
| "rewards/reward_repetition/std": 0.13447654247283936, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.6059508323669434, | |
| "sampling/importance_sampling_ratio/min": 1.5984405334448e-15, | |
| "sampling/sampling_logp_difference/max": 34.06974792480469, | |
| "sampling/sampling_logp_difference/mean": 3.0470666885375977, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 19.75, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.2150460854172707, | |
| "epoch": 0.0967741935483871, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 2.4473977088928223, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": -0.0729, | |
| "num_tokens": 1750970.0, | |
| "reward": 0.07604166865348816, | |
| "reward_std": 0.10753916203975677, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.004687500186264515, | |
| "rewards/reward_coverage/std": 0.03750000149011612, | |
| "rewards/reward_repetition/mean": 0.07135416567325592, | |
| "rewards/reward_repetition/std": 0.14673006534576416, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.564116358757019, | |
| "sampling/importance_sampling_ratio/min": 5.549738260316673e-19, | |
| "sampling/sampling_logp_difference/max": 42.03536605834961, | |
| "sampling/sampling_logp_difference/mean": 3.24617075920105, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 20.125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 1.1741499826312065, | |
| "epoch": 0.10752688172043011, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 2.0894999504089355, | |
| "learning_rate": 2.25e-06, | |
| "loss": -0.0295, | |
| "num_tokens": 1955314.0, | |
| "reward": 0.08229167759418488, | |
| "reward_std": 0.0766032412648201, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0062500000931322575, | |
| "rewards/reward_coverage/std": 0.05000000447034836, | |
| "rewards/reward_repetition/mean": 0.07604166865348816, | |
| "rewards/reward_repetition/std": 0.12633328139781952, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5676860809326172, | |
| "sampling/importance_sampling_ratio/min": 1.0015881904035955e-15, | |
| "sampling/sampling_logp_difference/max": 34.53718948364258, | |
| "sampling/sampling_logp_difference/mean": 3.318382978439331, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 20.375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.19177012052387, | |
| "epoch": 0.11827956989247312, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 2.127878189086914, | |
| "learning_rate": 2.5e-06, | |
| "loss": -0.044, | |
| "num_tokens": 2152810.0, | |
| "reward": 0.08697916567325592, | |
| "reward_std": 0.09207119792699814, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0, | |
| "rewards/reward_coverage/std": 0.0, | |
| "rewards/reward_repetition/mean": 0.08697916567325592, | |
| "rewards/reward_repetition/std": 0.1668112874031067, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5731910467147827, | |
| "sampling/importance_sampling_ratio/min": 5.365229931202564e-15, | |
| "sampling/sampling_logp_difference/max": 32.85883712768555, | |
| "sampling/sampling_logp_difference/mean": 3.2824923992156982, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 20.640625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 1.2268892796710134, | |
| "epoch": 0.12903225806451613, | |
| "frac_reward_zero_std": 0.53125, | |
| "grad_norm": 2.617131233215332, | |
| "learning_rate": 2.7500000000000004e-06, | |
| "loss": -0.0849, | |
| "num_tokens": 2354987.0, | |
| "reward": 0.10390624403953552, | |
| "reward_std": 0.1211656928062439, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0078125, | |
| "rewards/reward_coverage/std": 0.0625, | |
| "rewards/reward_repetition/mean": 0.09609375149011612, | |
| "rewards/reward_repetition/std": 0.16450294852256775, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5544115900993347, | |
| "sampling/importance_sampling_ratio/min": 7.363439390216395e-18, | |
| "sampling/sampling_logp_difference/max": 39.45000457763672, | |
| "sampling/sampling_logp_difference/mean": 3.3240010738372803, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 34.0, | |
| "completions/mean_length": 23.421875, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.1650395467877388, | |
| "epoch": 0.13978494623655913, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 2.670605182647705, | |
| "learning_rate": 3e-06, | |
| "loss": -0.0758, | |
| "num_tokens": 2616268.0, | |
| "reward": 0.18046876788139343, | |
| "reward_std": 0.17567184567451477, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.04218750074505806, | |
| "rewards/reward_coverage/std": 0.13190266489982605, | |
| "rewards/reward_repetition/mean": 0.13828124105930328, | |
| "rewards/reward_repetition/std": 0.18624858558177948, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5682023167610168, | |
| "sampling/importance_sampling_ratio/min": 1.0665693960122588e-19, | |
| "sampling/sampling_logp_difference/max": 43.684669494628906, | |
| "sampling/sampling_logp_difference/mean": 3.6000094413757324, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 34.0, | |
| "completions/mean_length": 27.984375, | |
| "completions/mean_terminated_length": 34.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.111030412837863, | |
| "epoch": 0.15053763440860216, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 2.47416615486145, | |
| "learning_rate": 3.2500000000000002e-06, | |
| "loss": -0.0662, | |
| "num_tokens": 2906261.0, | |
| "reward": 0.2278645932674408, | |
| "reward_std": 0.17272555828094482, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.03593749925494194, | |
| "rewards/reward_coverage/std": 0.09489709138870239, | |
| "rewards/reward_repetition/mean": 0.19192710518836975, | |
| "rewards/reward_repetition/std": 0.19924886524677277, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.540465772151947, | |
| "sampling/importance_sampling_ratio/min": 4.225438084884613e-17, | |
| "sampling/sampling_logp_difference/max": 37.702823638916016, | |
| "sampling/sampling_logp_difference/mean": 3.798464298248291, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 28.203125, | |
| "completions/mean_terminated_length": 35.5, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.0863127624616027, | |
| "epoch": 0.16129032258064516, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 3.092238664627075, | |
| "learning_rate": 3.5e-06, | |
| "loss": -0.0628, | |
| "num_tokens": 3202938.0, | |
| "reward": 0.30156248807907104, | |
| "reward_std": 0.19813722372055054, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.0390625, | |
| "rewards/reward_coverage/std": 0.10483121871948242, | |
| "rewards/reward_repetition/mean": 0.26249998807907104, | |
| "rewards/reward_repetition/std": 0.20803949236869812, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5526077151298523, | |
| "sampling/importance_sampling_ratio/min": 3.7098329921141575e-20, | |
| "sampling/sampling_logp_difference/max": 44.74071502685547, | |
| "sampling/sampling_logp_difference/mean": 3.7845804691314697, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 30.90625, | |
| "completions/mean_terminated_length": 35.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 1.0002785623073578, | |
| "epoch": 0.17204301075268819, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 2.3450825214385986, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": -0.0623, | |
| "num_tokens": 3523934.0, | |
| "reward": 0.41588544845581055, | |
| "reward_std": 0.2868938446044922, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.08281250298023224, | |
| "rewards/reward_coverage/std": 0.15384459495544434, | |
| "rewards/reward_repetition/mean": 0.3330729007720947, | |
| "rewards/reward_repetition/std": 0.2281644642353058, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.544209361076355, | |
| "sampling/importance_sampling_ratio/min": 1.4981450562263129e-16, | |
| "sampling/sampling_logp_difference/max": 36.4371337890625, | |
| "sampling/sampling_logp_difference/mean": 4.021841526031494, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.90625, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 33.53125, | |
| "completions/mean_terminated_length": 34.333335876464844, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 0.8482576478272676, | |
| "epoch": 0.1827956989247312, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.6775325536727905, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": -0.0187, | |
| "num_tokens": 3875784.0, | |
| "reward": 0.5098958015441895, | |
| "reward_std": 0.20402978360652924, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.13437500596046448, | |
| "rewards/reward_coverage/std": 0.16639859974384308, | |
| "rewards/reward_repetition/mean": 0.37552082538604736, | |
| "rewards/reward_repetition/std": 0.17110413312911987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5547957420349121, | |
| "sampling/importance_sampling_ratio/min": 1.9580492778949622e-20, | |
| "sampling/sampling_logp_difference/max": 45.37975311279297, | |
| "sampling/sampling_logp_difference/mean": 3.881425142288208, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.921875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 35.75, | |
| "completions/mean_terminated_length": 36.400001525878906, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 0.7285963352769613, | |
| "epoch": 0.1935483870967742, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.9497219324111938, | |
| "learning_rate": 4.25e-06, | |
| "loss": -0.0222, | |
| "num_tokens": 4230450.0, | |
| "reward": 0.6015625596046448, | |
| "reward_std": 0.17456699907779694, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.15000000596046448, | |
| "rewards/reward_coverage/std": 0.18856181204319, | |
| "rewards/reward_repetition/mean": 0.4515624940395355, | |
| "rewards/reward_repetition/std": 0.16296739876270294, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5840170383453369, | |
| "sampling/importance_sampling_ratio/min": 1.3230608049300072e-14, | |
| "sampling/sampling_logp_difference/max": 31.95624351501465, | |
| "sampling/sampling_logp_difference/mean": 3.7303450107574463, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.90625, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 35.5, | |
| "completions/mean_terminated_length": 35.333335876464844, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 30.0, | |
| "entropy": 0.7492767116054893, | |
| "epoch": 0.20430107526881722, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 2.027407169342041, | |
| "learning_rate": 4.5e-06, | |
| "loss": -0.0178, | |
| "num_tokens": 4592424.0, | |
| "reward": 0.5973958373069763, | |
| "reward_std": 0.2113954722881317, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.13124999403953552, | |
| "rewards/reward_coverage/std": 0.1670234352350235, | |
| "rewards/reward_repetition/mean": 0.4661458432674408, | |
| "rewards/reward_repetition/std": 0.17673227190971375, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.5936897993087769, | |
| "sampling/importance_sampling_ratio/min": 3.848576383998589e-20, | |
| "sampling/sampling_logp_difference/max": 44.70399856567383, | |
| "sampling/sampling_logp_difference/mean": 3.7467870712280273, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 37.25, | |
| "completions/mean_terminated_length": 36.33333206176758, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 0.6496950350701809, | |
| "epoch": 0.21505376344086022, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.7340210676193237, | |
| "learning_rate": 4.75e-06, | |
| "loss": -0.0028, | |
| "num_tokens": 4973854.0, | |
| "reward": 0.6968749761581421, | |
| "reward_std": 0.1944543719291687, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1875, | |
| "rewards/reward_coverage/std": 0.17320507764816284, | |
| "rewards/reward_repetition/mean": 0.5093749761581421, | |
| "rewards/reward_repetition/std": 0.14407385885715485, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.6289246678352356, | |
| "sampling/importance_sampling_ratio/min": 1.4430105039911333e-19, | |
| "sampling/sampling_logp_difference/max": 43.38238525390625, | |
| "sampling/sampling_logp_difference/mean": 3.522400379180908, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.9375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 37.96875, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.6573502826504409, | |
| "epoch": 0.22580645161290322, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.9576199054718018, | |
| "learning_rate": 5e-06, | |
| "loss": -0.0085, | |
| "num_tokens": 5357700.0, | |
| "reward": 0.715624988079071, | |
| "reward_std": 0.1944543719291687, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.17499999701976776, | |
| "rewards/reward_coverage/std": 0.1736626923084259, | |
| "rewards/reward_repetition/mean": 0.5406249761581421, | |
| "rewards/reward_repetition/std": 0.1649615317583084, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.666456401348114, | |
| "sampling/importance_sampling_ratio/min": 1.55370423422568e-14, | |
| "sampling/sampling_logp_difference/max": 31.795549392700195, | |
| "sampling/sampling_logp_difference/mean": 3.181442975997925, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 38.859375, | |
| "completions/mean_terminated_length": 36.66666793823242, | |
| "completions/min_length": 34.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 0.5784921627491713, | |
| "epoch": 0.23655913978494625, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.6491539478302002, | |
| "learning_rate": 4.931506849315069e-06, | |
| "loss": 0.009, | |
| "num_tokens": 5746525.0, | |
| "reward": 0.792187511920929, | |
| "reward_std": 0.1834058165550232, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.17772118747234344, | |
| "rewards/reward_repetition/mean": 0.5906250476837158, | |
| "rewards/reward_repetition/std": 0.1399759203195572, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.6919029355049133, | |
| "sampling/importance_sampling_ratio/min": 2.5034810497841535e-16, | |
| "sampling/sampling_logp_difference/max": 35.92367935180664, | |
| "sampling/sampling_logp_difference/mean": 3.11592435836792, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.03125, | |
| "completions/mean_terminated_length": 35.5, | |
| "completions/min_length": 32.0, | |
| "completions/min_terminated_length": 34.0, | |
| "entropy": 0.5693019391037524, | |
| "epoch": 0.24731182795698925, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.9411438703536987, | |
| "learning_rate": 4.863013698630138e-06, | |
| "loss": -0.0017, | |
| "num_tokens": 6135539.0, | |
| "reward": 0.859375, | |
| "reward_std": 0.19887377321720123, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.16855332255363464, | |
| "rewards/reward_repetition/mean": 0.6578124761581421, | |
| "rewards/reward_repetition/std": 0.1950211226940155, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.6962894201278687, | |
| "sampling/importance_sampling_ratio/min": 5.379221697881343e-17, | |
| "sampling/sampling_logp_difference/max": 37.461402893066406, | |
| "sampling/sampling_logp_difference/mean": 2.938572645187378, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.921875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.109375, | |
| "completions/mean_terminated_length": 35.20000076293945, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 0.521631199400872, | |
| "epoch": 0.25806451612903225, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 2.231412172317505, | |
| "learning_rate": 4.7945205479452054e-06, | |
| "loss": -0.0117, | |
| "num_tokens": 6524620.0, | |
| "reward": 0.862500011920929, | |
| "reward_std": 0.20329320430755615, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18437500298023224, | |
| "rewards/reward_coverage/std": 0.1801399439573288, | |
| "rewards/reward_repetition/mean": 0.6781250238418579, | |
| "rewards/reward_repetition/std": 0.1656816154718399, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7251441478729248, | |
| "sampling/importance_sampling_ratio/min": 1.906062915100565e-20, | |
| "sampling/sampling_logp_difference/max": 45.40666198730469, | |
| "sampling/sampling_logp_difference/mean": 2.8243846893310547, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.484375, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.46869752556085587, | |
| "epoch": 0.26881720430107525, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 1.5307422876358032, | |
| "learning_rate": 4.726027397260274e-06, | |
| "loss": -0.009, | |
| "num_tokens": 6914737.0, | |
| "reward": 0.9578125476837158, | |
| "reward_std": 0.19224464893341064, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.2109375, | |
| "rewards/reward_coverage/std": 0.16914087533950806, | |
| "rewards/reward_repetition/mean": 0.7468750476837158, | |
| "rewards/reward_repetition/std": 0.1603258103132248, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7427772879600525, | |
| "sampling/importance_sampling_ratio/min": 6.379089188001887e-19, | |
| "sampling/sampling_logp_difference/max": 41.89609146118164, | |
| "sampling/sampling_logp_difference/mean": 2.6809170246124268, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.53125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.44641283014789224, | |
| "epoch": 0.27956989247311825, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.65163254737854, | |
| "learning_rate": 4.657534246575343e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 7305211.0, | |
| "reward": 0.9937499761581421, | |
| "reward_std": 0.18119609355926514, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.171875, | |
| "rewards/reward_coverage/std": 0.15272004902362823, | |
| "rewards/reward_repetition/mean": 0.8218749761581421, | |
| "rewards/reward_repetition/std": 0.15580691397190094, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.742205023765564, | |
| "sampling/importance_sampling_ratio/min": 1.740283318765294e-20, | |
| "sampling/sampling_logp_difference/max": 45.49765396118164, | |
| "sampling/sampling_logp_difference/mean": 2.5815634727478027, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.5625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.39991177897900343, | |
| "epoch": 0.2903225806451613, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.0949409008026123, | |
| "learning_rate": 4.589041095890411e-06, | |
| "loss": -0.0053, | |
| "num_tokens": 7695441.0, | |
| "reward": 1.0, | |
| "reward_std": 0.2121320217847824, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.16874998807907104, | |
| "rewards/reward_coverage/std": 0.1780627816915512, | |
| "rewards/reward_repetition/mean": 0.831250011920929, | |
| "rewards/reward_repetition/std": 0.14786845445632935, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7544648051261902, | |
| "sampling/importance_sampling_ratio/min": 6.676384620995922e-19, | |
| "sampling/sampling_logp_difference/max": 41.85054016113281, | |
| "sampling/sampling_logp_difference/mean": 2.4634246826171875, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.953125, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.5625, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.3994480683468282, | |
| "epoch": 0.3010752688172043, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.35282301902771, | |
| "learning_rate": 4.52054794520548e-06, | |
| "loss": -0.0062, | |
| "num_tokens": 8086007.0, | |
| "reward": 1.015625, | |
| "reward_std": 0.20329320430755615, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21250000596046448, | |
| "rewards/reward_coverage/std": 0.1685606688261032, | |
| "rewards/reward_repetition/mean": 0.8031250238418579, | |
| "rewards/reward_repetition/std": 0.15732762217521667, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7630195617675781, | |
| "sampling/importance_sampling_ratio/min": 6.418834864883299e-17, | |
| "sampling/sampling_logp_difference/max": 37.28470993041992, | |
| "sampling/sampling_logp_difference/mean": 2.45412540435791, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.609375, | |
| "completions/mean_terminated_length": 36.5, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.33232213323935866, | |
| "epoch": 0.3118279569892473, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.4301323890686035, | |
| "learning_rate": 4.4520547945205486e-06, | |
| "loss": -0.0026, | |
| "num_tokens": 8476412.0, | |
| "reward": 1.0406250953674316, | |
| "reward_std": 0.15026018023490906, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21562501788139343, | |
| "rewards/reward_coverage/std": 0.14498905837535858, | |
| "rewards/reward_repetition/mean": 0.824999988079071, | |
| "rewards/reward_repetition/std": 0.14474937319755554, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7893605828285217, | |
| "sampling/importance_sampling_ratio/min": 6.775734351848958e-14, | |
| "sampling/sampling_logp_difference/max": 30.322843551635742, | |
| "sampling/sampling_logp_difference/mean": 2.41098690032959, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.65625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.31409848271869123, | |
| "epoch": 0.3225806451612903, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.2845097780227661, | |
| "learning_rate": 4.383561643835616e-06, | |
| "loss": -0.01, | |
| "num_tokens": 8866724.0, | |
| "reward": 1.0109374523162842, | |
| "reward_std": 0.1480504870414734, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18593749403953552, | |
| "rewards/reward_coverage/std": 0.13076962530612946, | |
| "rewards/reward_repetition/mean": 0.824999988079071, | |
| "rewards/reward_repetition/std": 0.14907118678092957, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7840973734855652, | |
| "sampling/importance_sampling_ratio/min": 1.471832771813246e-16, | |
| "sampling/sampling_logp_difference/max": 36.45485305786133, | |
| "sampling/sampling_logp_difference/mean": 2.4330239295959473, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.796875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.3091448312625289, | |
| "epoch": 0.3333333333333333, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 1.2038891315460205, | |
| "learning_rate": 4.315068493150685e-06, | |
| "loss": -0.0041, | |
| "num_tokens": 9257223.0, | |
| "reward": 1.017187476158142, | |
| "reward_std": 0.20550289750099182, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20468750596046448, | |
| "rewards/reward_coverage/std": 0.1803257167339325, | |
| "rewards/reward_repetition/mean": 0.8125, | |
| "rewards/reward_repetition/std": 0.1374368518590927, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7869740128517151, | |
| "sampling/importance_sampling_ratio/min": 8.298585072157721e-15, | |
| "sampling/sampling_logp_difference/max": 32.422691345214844, | |
| "sampling/sampling_logp_difference/mean": 2.4976119995117188, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.890625, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.2876437115482986, | |
| "epoch": 0.34408602150537637, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.9871541261672974, | |
| "learning_rate": 4.246575342465754e-06, | |
| "loss": 0.0037, | |
| "num_tokens": 9647784.0, | |
| "reward": 1.032812476158142, | |
| "reward_std": 0.17456698417663574, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18281251192092896, | |
| "rewards/reward_coverage/std": 0.15384458005428314, | |
| "rewards/reward_repetition/mean": 0.8500000238418579, | |
| "rewards/reward_repetition/std": 0.1380131095647812, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7916483283042908, | |
| "sampling/importance_sampling_ratio/min": 1.612893541327095e-14, | |
| "sampling/sampling_logp_difference/max": 31.758161544799805, | |
| "sampling/sampling_logp_difference/mean": 2.4329771995544434, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.796875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.27647654921747744, | |
| "epoch": 0.3548387096774194, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.4954354763031006, | |
| "learning_rate": 4.178082191780822e-06, | |
| "loss": -0.0059, | |
| "num_tokens": 10038289.0, | |
| "reward": 1.0218749046325684, | |
| "reward_std": 0.18119610846042633, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20937499403953552, | |
| "rewards/reward_coverage/std": 0.1687665432691574, | |
| "rewards/reward_repetition/mean": 0.8125, | |
| "rewards/reward_repetition/std": 0.12279806286096573, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7887318730354309, | |
| "sampling/importance_sampling_ratio/min": 8.831320397643036e-19, | |
| "sampling/sampling_logp_difference/max": 41.5708122253418, | |
| "sampling/sampling_logp_difference/mean": 2.5059313774108887, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.84375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.27092319959774613, | |
| "epoch": 0.3655913978494624, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.6384416818618774, | |
| "learning_rate": 4.109589041095891e-06, | |
| "loss": -0.0034, | |
| "num_tokens": 10428797.0, | |
| "reward": 1.0593750476837158, | |
| "reward_std": 0.17235726118087769, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.23125001788139343, | |
| "rewards/reward_coverage/std": 0.15314172208309174, | |
| "rewards/reward_repetition/mean": 0.828125, | |
| "rewards/reward_repetition/std": 0.12782520055770874, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7671718597412109, | |
| "sampling/importance_sampling_ratio/min": 6.987482079498287e-19, | |
| "sampling/sampling_logp_difference/max": 41.804996490478516, | |
| "sampling/sampling_logp_difference/mean": 2.6257989406585693, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.8125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2952587741892785, | |
| "epoch": 0.3763440860215054, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.0413151979446411, | |
| "learning_rate": 4.0410958904109595e-06, | |
| "loss": -0.0099, | |
| "num_tokens": 10819301.0, | |
| "reward": 0.9296875596046448, | |
| "reward_std": 0.13921163976192474, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.16093750298023224, | |
| "rewards/reward_coverage/std": 0.12550494074821472, | |
| "rewards/reward_repetition/mean": 0.768750011920929, | |
| "rewards/reward_repetition/std": 0.11391307413578033, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7573365569114685, | |
| "sampling/importance_sampling_ratio/min": 1.5061544475743168e-18, | |
| "sampling/sampling_logp_difference/max": 41.03697204589844, | |
| "sampling/sampling_logp_difference/mean": 2.6861050128936768, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.27068308740854263, | |
| "epoch": 0.3870967741935484, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.277216911315918, | |
| "learning_rate": 3.972602739726027e-06, | |
| "loss": -0.0004, | |
| "num_tokens": 11209548.0, | |
| "reward": 0.989062488079071, | |
| "reward_std": 0.17014756798744202, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21406251192092896, | |
| "rewards/reward_coverage/std": 0.17262418568134308, | |
| "rewards/reward_repetition/mean": 0.7749999761581421, | |
| "rewards/reward_repetition/std": 0.1154700517654419, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7654402256011963, | |
| "sampling/importance_sampling_ratio/min": 4.38677482468737e-14, | |
| "sampling/sampling_logp_difference/max": 30.757596969604492, | |
| "sampling/sampling_logp_difference/mean": 2.659276008605957, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.796875, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.2760939297731966, | |
| "epoch": 0.3978494623655914, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.504520297050476, | |
| "learning_rate": 3.904109589041096e-06, | |
| "loss": -0.0042, | |
| "num_tokens": 11599953.0, | |
| "reward": 0.9546874761581421, | |
| "reward_std": 0.15246990323066711, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.17656250298023224, | |
| "rewards/reward_coverage/std": 0.1318274438381195, | |
| "rewards/reward_repetition/mean": 0.7781250476837158, | |
| "rewards/reward_repetition/std": 0.12404395639896393, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7589582800865173, | |
| "sampling/importance_sampling_ratio/min": 1.2950150141564556e-22, | |
| "sampling/sampling_logp_difference/max": 50.39834976196289, | |
| "sampling/sampling_logp_difference/mean": 2.7161073684692383, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.27736608777195215, | |
| "epoch": 0.40860215053763443, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.6770557165145874, | |
| "learning_rate": 3.8356164383561645e-06, | |
| "loss": -0.0018, | |
| "num_tokens": 11990363.0, | |
| "reward": 0.926562488079071, | |
| "reward_std": 0.14363107085227966, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.17031250894069672, | |
| "rewards/reward_coverage/std": 0.15293914079666138, | |
| "rewards/reward_repetition/mean": 0.7562500238418579, | |
| "rewards/reward_repetition/std": 0.1152980849146843, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7690252661705017, | |
| "sampling/importance_sampling_ratio/min": 5.327883295646056e-16, | |
| "sampling/sampling_logp_difference/max": 35.16840744018555, | |
| "sampling/sampling_logp_difference/mean": 2.6216495037078857, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.8125, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.30535601382143795, | |
| "epoch": 0.41935483870967744, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.455296277999878, | |
| "learning_rate": 3.767123287671233e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 12380581.0, | |
| "reward": 0.948437511920929, | |
| "reward_std": 0.2010834813117981, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1796875, | |
| "rewards/reward_coverage/std": 0.16825877130031586, | |
| "rewards/reward_repetition/mean": 0.7687499523162842, | |
| "rewards/reward_repetition/std": 0.12456272542476654, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7560758590698242, | |
| "sampling/importance_sampling_ratio/min": 1.0004220113236836e-18, | |
| "sampling/sampling_logp_difference/max": 41.446109771728516, | |
| "sampling/sampling_logp_difference/mean": 2.5334110260009766, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.30565810902044177, | |
| "epoch": 0.43010752688172044, | |
| "frac_reward_zero_std": 0.03125, | |
| "grad_norm": 1.2029423713684082, | |
| "learning_rate": 3.6986301369863014e-06, | |
| "loss": -0.0015, | |
| "num_tokens": 12770797.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.19003495573997498, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19062501192092896, | |
| "rewards/reward_coverage/std": 0.15504096448421478, | |
| "rewards/reward_repetition/mean": 0.778124988079071, | |
| "rewards/reward_repetition/std": 0.12404395639896393, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7597452402114868, | |
| "sampling/importance_sampling_ratio/min": 1.379195585862747e-12, | |
| "sampling/sampling_logp_difference/max": 27.309520721435547, | |
| "sampling/sampling_logp_difference/mean": 2.4547033309936523, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 33.0, | |
| "completions/mean_length": 39.6875, | |
| "completions/mean_terminated_length": 33.0, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 0.31394060072489083, | |
| "epoch": 0.44086021505376344, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 1.7512142658233643, | |
| "learning_rate": 3.6301369863013704e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 13161191.0, | |
| "reward": 1.009374976158142, | |
| "reward_std": 0.18561550974845886, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.22499999403953552, | |
| "rewards/reward_coverage/std": 0.15936382114887238, | |
| "rewards/reward_repetition/mean": 0.7843749523162842, | |
| "rewards/reward_repetition/std": 0.13940775394439697, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7532055974006653, | |
| "sampling/importance_sampling_ratio/min": 3.4107995156513595e-17, | |
| "sampling/sampling_logp_difference/max": 37.91699981689453, | |
| "sampling/sampling_logp_difference/mean": 2.571798086166382, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.301532520679757, | |
| "epoch": 0.45161290322580644, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.6585807800292969, | |
| "learning_rate": 3.5616438356164386e-06, | |
| "loss": -0.0042, | |
| "num_tokens": 13551781.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.13258251547813416, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19375000894069672, | |
| "rewards/reward_coverage/std": 0.1562202423810959, | |
| "rewards/reward_repetition/mean": 0.7593749761581421, | |
| "rewards/reward_repetition/std": 0.10796640068292618, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7576152682304382, | |
| "sampling/importance_sampling_ratio/min": 4.25440330295826e-18, | |
| "sampling/sampling_logp_difference/max": 39.99857711791992, | |
| "sampling/sampling_logp_difference/mean": 2.5987966060638428, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2994292816147208, | |
| "epoch": 0.46236559139784944, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.1511462926864624, | |
| "learning_rate": 3.4931506849315072e-06, | |
| "loss": -0.0023, | |
| "num_tokens": 13941920.0, | |
| "reward": 0.9578125476837158, | |
| "reward_std": 0.12595339119434357, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19218748807907104, | |
| "rewards/reward_coverage/std": 0.15045401453971863, | |
| "rewards/reward_repetition/mean": 0.765625, | |
| "rewards/reward_repetition/std": 0.10422617197036743, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7622615694999695, | |
| "sampling/importance_sampling_ratio/min": 1.1524427466063367e-15, | |
| "sampling/sampling_logp_difference/max": 34.39689254760742, | |
| "sampling/sampling_logp_difference/mean": 2.527230978012085, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.9375, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.2939116738270968, | |
| "epoch": 0.4731182795698925, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.2386677265167236, | |
| "learning_rate": 3.4246575342465754e-06, | |
| "loss": -0.0099, | |
| "num_tokens": 14332340.0, | |
| "reward": 0.9515625238418579, | |
| "reward_std": 0.1480504721403122, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19218750298023224, | |
| "rewards/reward_coverage/std": 0.15461647510528564, | |
| "rewards/reward_repetition/mean": 0.7593749761581421, | |
| "rewards/reward_repetition/std": 0.12436345219612122, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7619537711143494, | |
| "sampling/importance_sampling_ratio/min": 3.6812737413604546e-19, | |
| "sampling/sampling_logp_difference/max": 42.445858001708984, | |
| "sampling/sampling_logp_difference/mean": 2.5976765155792236, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.890625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2781213163398206, | |
| "epoch": 0.4838709677419355, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.170350193977356, | |
| "learning_rate": 3.356164383561644e-06, | |
| "loss": -0.0036, | |
| "num_tokens": 14722931.0, | |
| "reward": 0.9953124523162842, | |
| "reward_std": 0.12153396755456924, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1953125, | |
| "rewards/reward_coverage/std": 0.14412261545658112, | |
| "rewards/reward_repetition/mean": 0.800000011920929, | |
| "rewards/reward_repetition/std": 0.08728715777397156, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7608794569969177, | |
| "sampling/importance_sampling_ratio/min": 3.331248850987206e-13, | |
| "sampling/sampling_logp_difference/max": 28.73025894165039, | |
| "sampling/sampling_logp_difference/mean": 2.6023521423339844, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.84375, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.27238480327650905, | |
| "epoch": 0.4946236559139785, | |
| "frac_reward_zero_std": 0.03125, | |
| "grad_norm": 1.1889655590057373, | |
| "learning_rate": 3.2876712328767123e-06, | |
| "loss": -0.005, | |
| "num_tokens": 15113515.0, | |
| "reward": 0.9437500238418579, | |
| "reward_std": 0.17235726118087769, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19062499701976776, | |
| "rewards/reward_coverage/std": 0.14333748817443848, | |
| "rewards/reward_repetition/mean": 0.7531249523162842, | |
| "rewards/reward_repetition/std": 0.09915315359830856, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7437648177146912, | |
| "sampling/importance_sampling_ratio/min": 3.385970521172965e-19, | |
| "sampling/sampling_logp_difference/max": 42.529476165771484, | |
| "sampling/sampling_logp_difference/mean": 2.778402328491211, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.890625, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.24875370506197214, | |
| "epoch": 0.5053763440860215, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.1314892768859863, | |
| "learning_rate": 3.2191780821917813e-06, | |
| "loss": -0.0031, | |
| "num_tokens": 15503912.0, | |
| "reward": 0.942187488079071, | |
| "reward_std": 0.15688931941986084, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18906250596046448, | |
| "rewards/reward_coverage/std": 0.12230224162340164, | |
| "rewards/reward_repetition/mean": 0.7531249523162842, | |
| "rewards/reward_repetition/std": 0.12210943549871445, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7534346580505371, | |
| "sampling/importance_sampling_ratio/min": 3.298192560128319e-15, | |
| "sampling/sampling_logp_difference/max": 33.345401763916016, | |
| "sampling/sampling_logp_difference/mean": 2.7750205993652344, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.875, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.24461835296824574, | |
| "epoch": 0.5161290322580645, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 0.6554945111274719, | |
| "learning_rate": 3.1506849315068495e-06, | |
| "loss": -0.0048, | |
| "num_tokens": 15894138.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.16793785989284515, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21875, | |
| "rewards/reward_coverage/std": 0.13554710149765015, | |
| "rewards/reward_repetition/mean": 0.734375, | |
| "rewards/reward_repetition/std": 0.12372364103794098, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7367614507675171, | |
| "sampling/importance_sampling_ratio/min": 1.2913128950274503e-18, | |
| "sampling/sampling_logp_difference/max": 41.19087219238281, | |
| "sampling/sampling_logp_difference/mean": 2.8969175815582275, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.221066806698218, | |
| "epoch": 0.5268817204301075, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.813700795173645, | |
| "learning_rate": 3.082191780821918e-06, | |
| "loss": -0.0125, | |
| "num_tokens": 16284827.0, | |
| "reward": 0.9203125238418579, | |
| "reward_std": 0.1657281517982483, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18906250596046448, | |
| "rewards/reward_coverage/std": 0.1310727298259735, | |
| "rewards/reward_repetition/mean": 0.7312500476837158, | |
| "rewards/reward_repetition/std": 0.12456272542476654, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7454463839530945, | |
| "sampling/importance_sampling_ratio/min": 2.6984808100466543e-19, | |
| "sampling/sampling_logp_difference/max": 42.75642776489258, | |
| "sampling/sampling_logp_difference/mean": 3.0947470664978027, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2340390719473362, | |
| "epoch": 0.5376344086021505, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.1709717512130737, | |
| "learning_rate": 3.0136986301369864e-06, | |
| "loss": -0.0098, | |
| "num_tokens": 16675257.0, | |
| "reward": 0.9140625596046448, | |
| "reward_std": 0.13479222357273102, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18906250596046448, | |
| "rewards/reward_coverage/std": 0.14155283570289612, | |
| "rewards/reward_repetition/mean": 0.7250000238418579, | |
| "rewards/reward_repetition/std": 0.10983392596244812, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.732937216758728, | |
| "sampling/importance_sampling_ratio/min": 1.6756041496787032e-16, | |
| "sampling/sampling_logp_difference/max": 36.32518768310547, | |
| "sampling/sampling_logp_difference/mean": 3.183100700378418, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 39.0, | |
| "completions/mean_length": 39.78125, | |
| "completions/mean_terminated_length": 39.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 39.0, | |
| "entropy": 0.20713584939949214, | |
| "epoch": 0.5483870967741935, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.8335843086242676, | |
| "learning_rate": 2.945205479452055e-06, | |
| "loss": -0.0031, | |
| "num_tokens": 17065845.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.15026019513607025, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21875, | |
| "rewards/reward_coverage/std": 0.1390158236026764, | |
| "rewards/reward_repetition/mean": 0.734375, | |
| "rewards/reward_repetition/std": 0.10722880065441132, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.739495038986206, | |
| "sampling/importance_sampling_ratio/min": 4.677705163318169e-13, | |
| "sampling/sampling_logp_difference/max": 28.390798568725586, | |
| "sampling/sampling_logp_difference/mean": 3.198160409927368, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.9375, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.19506761734373868, | |
| "epoch": 0.5591397849462365, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.6900437474250793, | |
| "learning_rate": 2.876712328767123e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 17456339.0, | |
| "reward": 0.9359375238418579, | |
| "reward_std": 0.17898640036582947, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20468750596046448, | |
| "rewards/reward_coverage/std": 0.17129728198051453, | |
| "rewards/reward_repetition/mean": 0.731249988079071, | |
| "rewards/reward_repetition/std": 0.09574270248413086, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7362239956855774, | |
| "sampling/importance_sampling_ratio/min": 2.602479520623457e-19, | |
| "sampling/sampling_logp_difference/max": 42.79265213012695, | |
| "sampling/sampling_logp_difference/mean": 3.2387518882751465, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "entropy": 0.1951053044758737, | |
| "epoch": 0.5698924731182796, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.0901892185211182, | |
| "learning_rate": 2.8082191780821922e-06, | |
| "loss": -0.0096, | |
| "num_tokens": 17846929.0, | |
| "reward": 0.9390625357627869, | |
| "reward_std": 0.19224466383457184, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.2109375, | |
| "rewards/reward_coverage/std": 0.17193317413330078, | |
| "rewards/reward_repetition/mean": 0.7281249761581421, | |
| "rewards/reward_repetition/std": 0.10307764261960983, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7550839781761169, | |
| "sampling/importance_sampling_ratio/min": 1.1955911409525077e-18, | |
| "sampling/sampling_logp_difference/max": 41.26789093017578, | |
| "sampling/sampling_logp_difference/mean": 3.1413958072662354, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.765625, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.21364939608611166, | |
| "epoch": 0.5806451612903226, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.0280330181121826, | |
| "learning_rate": 2.7397260273972604e-06, | |
| "loss": -0.0093, | |
| "num_tokens": 18237620.0, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.1414213478565216, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19062501192092896, | |
| "rewards/reward_coverage/std": 0.1540137678384781, | |
| "rewards/reward_repetition/mean": 0.7593749761581421, | |
| "rewards/reward_repetition/std": 0.0885845422744751, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.738981306552887, | |
| "sampling/importance_sampling_ratio/min": 4.2308257591772147e-13, | |
| "sampling/sampling_logp_difference/max": 28.491209030151367, | |
| "sampling/sampling_logp_difference/mean": 3.156113624572754, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.890625, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.24034091946668923, | |
| "epoch": 0.5913978494623656, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 1.18429696559906, | |
| "learning_rate": 2.671232876712329e-06, | |
| "loss": -0.0049, | |
| "num_tokens": 18628129.0, | |
| "reward": 0.8953125476837158, | |
| "reward_std": 0.1657281517982483, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1796875, | |
| "rewards/reward_coverage/std": 0.1299324631690979, | |
| "rewards/reward_repetition/mean": 0.715624988079071, | |
| "rewards/reward_repetition/std": 0.11158134788274765, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7346891760826111, | |
| "sampling/importance_sampling_ratio/min": 1.511831024952892e-13, | |
| "sampling/sampling_logp_difference/max": 29.52028465270996, | |
| "sampling/sampling_logp_difference/mean": 3.2208282947540283, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 33.0, | |
| "completions/mean_length": 39.84375, | |
| "completions/mean_terminated_length": 33.0, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 0.23257427848875523, | |
| "epoch": 0.6021505376344086, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.6966381072998047, | |
| "learning_rate": 2.6027397260273973e-06, | |
| "loss": -0.0088, | |
| "num_tokens": 19018723.0, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.15026018023490906, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19687500596046448, | |
| "rewards/reward_coverage/std": 0.1284446120262146, | |
| "rewards/reward_repetition/mean": 0.737500011920929, | |
| "rewards/reward_repetition/std": 0.106159508228302, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7426254153251648, | |
| "sampling/importance_sampling_ratio/min": 7.579855932368998e-14, | |
| "sampling/sampling_logp_difference/max": 30.210697174072266, | |
| "sampling/sampling_logp_difference/mean": 3.221386194229126, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2281794489827007, | |
| "epoch": 0.6129032258064516, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.7310183048248291, | |
| "learning_rate": 2.534246575342466e-06, | |
| "loss": -0.0083, | |
| "num_tokens": 19409232.0, | |
| "reward": 0.9671875238418579, | |
| "reward_std": 0.14363107085227966, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.22968751192092896, | |
| "rewards/reward_coverage/std": 0.14979319274425507, | |
| "rewards/reward_repetition/mean": 0.737500011920929, | |
| "rewards/reward_repetition/std": 0.10000000149011612, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7459241151809692, | |
| "sampling/importance_sampling_ratio/min": 1.3522516009469616e-19, | |
| "sampling/sampling_logp_difference/max": 43.44734573364258, | |
| "sampling/sampling_logp_difference/mean": 3.226292133331299, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24348380486480892, | |
| "epoch": 0.6236559139784946, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.9412605166435242, | |
| "learning_rate": 2.4657534246575345e-06, | |
| "loss": -0.0063, | |
| "num_tokens": 19799925.0, | |
| "reward": 0.948437511920929, | |
| "reward_std": 0.15246990323066711, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.14637655019760132, | |
| "rewards/reward_repetition/mean": 0.746874988079071, | |
| "rewards/reward_repetition/std": 0.10833332687616348, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7302582263946533, | |
| "sampling/importance_sampling_ratio/min": 1.1974077329752507e-18, | |
| "sampling/sampling_logp_difference/max": 41.26637268066406, | |
| "sampling/sampling_logp_difference/mean": 3.3912689685821533, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.22413912834599614, | |
| "epoch": 0.6344086021505376, | |
| "frac_reward_zero_std": 0.28125, | |
| "grad_norm": 0.6372097730636597, | |
| "learning_rate": 2.3972602739726027e-06, | |
| "loss": -0.0078, | |
| "num_tokens": 20190710.0, | |
| "reward": 0.953125, | |
| "reward_std": 0.11048543453216553, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1875, | |
| "rewards/reward_coverage/std": 0.1278640329837799, | |
| "rewards/reward_repetition/mean": 0.765625, | |
| "rewards/reward_repetition/std": 0.08398554474115372, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7377102971076965, | |
| "sampling/importance_sampling_ratio/min": 5.862870793152246e-21, | |
| "sampling/sampling_logp_difference/max": 46.58564758300781, | |
| "sampling/sampling_logp_difference/mean": 3.426024913787842, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.23593324795365334, | |
| "epoch": 0.6451612903225806, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.7991491556167603, | |
| "learning_rate": 2.3287671232876713e-06, | |
| "loss": -0.0066, | |
| "num_tokens": 20581308.0, | |
| "reward": 0.9515625238418579, | |
| "reward_std": 0.13921165466308594, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18906250596046448, | |
| "rewards/reward_coverage/std": 0.12487097084522247, | |
| "rewards/reward_repetition/mean": 0.7625000476837158, | |
| "rewards/reward_repetition/std": 0.11751393228769302, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7420527935028076, | |
| "sampling/importance_sampling_ratio/min": 1.627108762957747e-23, | |
| "sampling/sampling_logp_difference/max": 52.472652435302734, | |
| "sampling/sampling_logp_difference/mean": 3.326465606689453, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.9375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.25969044235534966, | |
| "epoch": 0.6559139784946236, | |
| "frac_reward_zero_std": 0.03125, | |
| "grad_norm": 0.9682433009147644, | |
| "learning_rate": 2.26027397260274e-06, | |
| "loss": -0.0109, | |
| "num_tokens": 20972092.0, | |
| "reward": 0.9828125238418579, | |
| "reward_std": 0.15688931941986084, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20781250298023224, | |
| "rewards/reward_coverage/std": 0.1336955726146698, | |
| "rewards/reward_repetition/mean": 0.7749999761581421, | |
| "rewards/reward_repetition/std": 0.09759000688791275, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7352744936943054, | |
| "sampling/importance_sampling_ratio/min": 4.3607164320474956e-13, | |
| "sampling/sampling_logp_difference/max": 28.460969924926758, | |
| "sampling/sampling_logp_difference/mean": 3.418292999267578, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.23248756467364728, | |
| "epoch": 0.6666666666666666, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.9272934198379517, | |
| "learning_rate": 2.191780821917808e-06, | |
| "loss": -0.0034, | |
| "num_tokens": 21362873.0, | |
| "reward": 0.9390624761581421, | |
| "reward_std": 0.13921163976192474, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18593750894069672, | |
| "rewards/reward_coverage/std": 0.11800886690616608, | |
| "rewards/reward_repetition/mean": 0.7531249523162842, | |
| "rewards/reward_repetition/std": 0.1053621917963028, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7381278276443481, | |
| "sampling/importance_sampling_ratio/min": 5.011335584784865e-14, | |
| "sampling/sampling_logp_difference/max": 30.624488830566406, | |
| "sampling/sampling_logp_difference/mean": 3.4408388137817383, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.890625, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.2272115428932011, | |
| "epoch": 0.6774193548387096, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.6223781704902649, | |
| "learning_rate": 2.123287671232877e-06, | |
| "loss": -0.0014, | |
| "num_tokens": 21753644.0, | |
| "reward": 0.9937499761581421, | |
| "reward_std": 0.16793787479400635, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20625001192092896, | |
| "rewards/reward_coverage/std": 0.15210169553756714, | |
| "rewards/reward_repetition/mean": 0.7875000238418579, | |
| "rewards/reward_repetition/std": 0.08637312799692154, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7413582801818848, | |
| "sampling/importance_sampling_ratio/min": 8.111444921513807e-17, | |
| "sampling/sampling_logp_difference/max": 37.0506706237793, | |
| "sampling/sampling_logp_difference/mean": 3.607243776321411, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2311963385436684, | |
| "epoch": 0.6881720430107527, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.8859496116638184, | |
| "learning_rate": 2.0547945205479454e-06, | |
| "loss": 0.0046, | |
| "num_tokens": 22144251.0, | |
| "reward": 1.0171875953674316, | |
| "reward_std": 0.15688931941986084, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.2421875, | |
| "rewards/reward_coverage/std": 0.16407963633537292, | |
| "rewards/reward_repetition/mean": 0.7749999761581421, | |
| "rewards/reward_repetition/std": 0.09085135161876678, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7465033531188965, | |
| "sampling/importance_sampling_ratio/min": 4.7572778301925226e-18, | |
| "sampling/sampling_logp_difference/max": 39.88685607910156, | |
| "sampling/sampling_logp_difference/mean": 3.552140951156616, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2426956002600491, | |
| "epoch": 0.6989247311827957, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.269731044769287, | |
| "learning_rate": 1.9863013698630136e-06, | |
| "loss": -0.0092, | |
| "num_tokens": 22535042.0, | |
| "reward": 0.9437500238418579, | |
| "reward_std": 0.16793784499168396, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.16562500596046448, | |
| "rewards/reward_coverage/std": 0.1382644772529602, | |
| "rewards/reward_repetition/mean": 0.778124988079071, | |
| "rewards/reward_repetition/std": 0.10759823769330978, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.745509684085846, | |
| "sampling/importance_sampling_ratio/min": 2.5291580594867795e-16, | |
| "sampling/sampling_logp_difference/max": 35.913475036621094, | |
| "sampling/sampling_logp_difference/mean": 3.580662488937378, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 40.0, | |
| "entropy": 0.25972409872338176, | |
| "epoch": 0.7096774193548387, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.2360777854919434, | |
| "learning_rate": 1.9178082191780823e-06, | |
| "loss": -0.0021, | |
| "num_tokens": 22925826.0, | |
| "reward": 0.989062488079071, | |
| "reward_std": 0.13921163976192474, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.1578548550605774, | |
| "rewards/reward_repetition/mean": 0.7875000238418579, | |
| "rewards/reward_repetition/std": 0.07867958396673203, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.742576003074646, | |
| "sampling/importance_sampling_ratio/min": 3.8009925881678924e-15, | |
| "sampling/sampling_logp_difference/max": 33.203514099121094, | |
| "sampling/sampling_logp_difference/mean": 3.5737202167510986, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.9375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24233098467811942, | |
| "epoch": 0.7204301075268817, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.0064667463302612, | |
| "learning_rate": 1.8493150684931507e-06, | |
| "loss": -0.0074, | |
| "num_tokens": 23316620.0, | |
| "reward": 0.9796874523162842, | |
| "reward_std": 0.16130872070789337, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18281251192092896, | |
| "rewards/reward_coverage/std": 0.13280214369297028, | |
| "rewards/reward_repetition/mean": 0.796875, | |
| "rewards/reward_repetition/std": 0.11542708426713943, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7610887885093689, | |
| "sampling/importance_sampling_ratio/min": 3.512002747491507e-17, | |
| "sampling/sampling_logp_difference/max": 37.887760162353516, | |
| "sampling/sampling_logp_difference/mean": 3.572136878967285, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.26319174305535853, | |
| "epoch": 0.7311827956989247, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.0560470819473267, | |
| "learning_rate": 1.7808219178082193e-06, | |
| "loss": -0.012, | |
| "num_tokens": 23707121.0, | |
| "reward": 0.9937500953674316, | |
| "reward_std": 0.16351842880249023, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21562500298023224, | |
| "rewards/reward_coverage/std": 0.13359349966049194, | |
| "rewards/reward_repetition/mean": 0.778124988079071, | |
| "rewards/reward_repetition/std": 0.12404395639896393, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7442783713340759, | |
| "sampling/importance_sampling_ratio/min": 1.375699243920652e-15, | |
| "sampling/sampling_logp_difference/max": 34.21981430053711, | |
| "sampling/sampling_logp_difference/mean": 3.5864293575286865, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.78125, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.27109498833306134, | |
| "epoch": 0.7419354838709677, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.1270716190338135, | |
| "learning_rate": 1.7123287671232877e-06, | |
| "loss": -0.0065, | |
| "num_tokens": 24097617.0, | |
| "reward": 0.9968750476837158, | |
| "reward_std": 0.1237436830997467, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20937500894069672, | |
| "rewards/reward_coverage/std": 0.1376892626285553, | |
| "rewards/reward_repetition/mean": 0.7875000238418579, | |
| "rewards/reward_repetition/std": 0.09343531727790833, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7473452687263489, | |
| "sampling/importance_sampling_ratio/min": 1.4495503789956396e-16, | |
| "sampling/sampling_logp_difference/max": 36.47010803222656, | |
| "sampling/sampling_logp_difference/mean": 3.4904568195343018, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.26768106454983354, | |
| "epoch": 0.7526881720430108, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.1696867942810059, | |
| "learning_rate": 1.6438356164383561e-06, | |
| "loss": -0.0057, | |
| "num_tokens": 24488387.0, | |
| "reward": 1.0328125953674316, | |
| "reward_std": 0.13921163976192474, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.22031250596046448, | |
| "rewards/reward_coverage/std": 0.13590343296527863, | |
| "rewards/reward_repetition/mean": 0.8125, | |
| "rewards/reward_repetition/std": 0.11198072135448456, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7499436736106873, | |
| "sampling/importance_sampling_ratio/min": 1.4622693992618306e-15, | |
| "sampling/sampling_logp_difference/max": 34.15878677368164, | |
| "sampling/sampling_logp_difference/mean": 3.6728289127349854, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24969360628165305, | |
| "epoch": 0.7634408602150538, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.8783805966377258, | |
| "learning_rate": 1.5753424657534248e-06, | |
| "loss": -0.0031, | |
| "num_tokens": 24879084.0, | |
| "reward": 1.0343749523162842, | |
| "reward_std": 0.1590990126132965, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19062501192092896, | |
| "rewards/reward_coverage/std": 0.13179922103881836, | |
| "rewards/reward_repetition/mean": 0.8437500596046448, | |
| "rewards/reward_repetition/std": 0.10965313017368317, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7588081955909729, | |
| "sampling/importance_sampling_ratio/min": 3.0506860601055286e-14, | |
| "sampling/sampling_logp_difference/max": 31.120824813842773, | |
| "sampling/sampling_logp_difference/mean": 3.584549903869629, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.796875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2575332070700824, | |
| "epoch": 0.7741935483870968, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 0.6952300071716309, | |
| "learning_rate": 1.5068493150684932e-06, | |
| "loss": -0.0028, | |
| "num_tokens": 25269863.0, | |
| "reward": 1.029687523841858, | |
| "reward_std": 0.14363105595111847, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1953125, | |
| "rewards/reward_coverage/std": 0.1396477371454239, | |
| "rewards/reward_repetition/mean": 0.8343750238418579, | |
| "rewards/reward_repetition/std": 0.11014961451292038, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.767298698425293, | |
| "sampling/importance_sampling_ratio/min": 1.6540065300593926e-15, | |
| "sampling/sampling_logp_difference/max": 34.03557586669922, | |
| "sampling/sampling_logp_difference/mean": 3.5412185192108154, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.796875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.27034957450814545, | |
| "epoch": 0.7849462365591398, | |
| "frac_reward_zero_std": 0.28125, | |
| "grad_norm": 1.3305058479309082, | |
| "learning_rate": 1.4383561643835616e-06, | |
| "loss": -0.0086, | |
| "num_tokens": 25660624.0, | |
| "reward": 0.9765625, | |
| "reward_std": 0.1303728073835373, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19218750298023224, | |
| "rewards/reward_coverage/std": 0.11724982410669327, | |
| "rewards/reward_repetition/mean": 0.784375011920929, | |
| "rewards/reward_repetition/std": 0.10269193351268768, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7455124258995056, | |
| "sampling/importance_sampling_ratio/min": 4.9776697520764746e-14, | |
| "sampling/sampling_logp_difference/max": 30.631229400634766, | |
| "sampling/sampling_logp_difference/mean": 3.557706594467163, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 40.0, | |
| "entropy": 0.2672195213381201, | |
| "epoch": 0.7956989247311828, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.368238925933838, | |
| "learning_rate": 1.3698630136986302e-06, | |
| "loss": -0.0091, | |
| "num_tokens": 26051389.0, | |
| "reward": 1.0171875953674316, | |
| "reward_std": 0.15246988832950592, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1953125, | |
| "rewards/reward_coverage/std": 0.1361951231956482, | |
| "rewards/reward_repetition/mean": 0.8218749761581421, | |
| "rewards/reward_repetition/std": 0.10759823024272919, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7550181746482849, | |
| "sampling/importance_sampling_ratio/min": 4.837896576403988e-13, | |
| "sampling/sampling_logp_difference/max": 28.357126235961914, | |
| "sampling/sampling_logp_difference/mean": 3.518388271331787, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2512192933354527, | |
| "epoch": 0.8064516129032258, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.8075931072235107, | |
| "learning_rate": 1.3013698630136986e-06, | |
| "loss": -0.0055, | |
| "num_tokens": 26442164.0, | |
| "reward": 0.9906250238418579, | |
| "reward_std": 0.12816309928894043, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.16562500596046448, | |
| "rewards/reward_coverage/std": 0.12626346945762634, | |
| "rewards/reward_repetition/mean": 0.8250000476837158, | |
| "rewards/reward_repetition/std": 0.09085134416818619, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7599254250526428, | |
| "sampling/importance_sampling_ratio/min": 1.1269083539586222e-12, | |
| "sampling/sampling_logp_difference/max": 27.51154327392578, | |
| "sampling/sampling_logp_difference/mean": 3.5641071796417236, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.25094706076197326, | |
| "epoch": 0.8172043010752689, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.8058338165283203, | |
| "learning_rate": 1.2328767123287673e-06, | |
| "loss": -0.008, | |
| "num_tokens": 26832860.0, | |
| "reward": 1.0093750953674316, | |
| "reward_std": 0.1767766773700714, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19374999403953552, | |
| "rewards/reward_coverage/std": 0.16122055053710938, | |
| "rewards/reward_repetition/mean": 0.815625011920929, | |
| "rewards/reward_repetition/std": 0.10269193351268768, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7578780055046082, | |
| "sampling/importance_sampling_ratio/min": 1.1848823085411635e-15, | |
| "sampling/sampling_logp_difference/max": 34.36913299560547, | |
| "sampling/sampling_logp_difference/mean": 3.489642858505249, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.90625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.26383460965007544, | |
| "epoch": 0.8279569892473119, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.9045315384864807, | |
| "learning_rate": 1.1643835616438357e-06, | |
| "loss": -0.0039, | |
| "num_tokens": 27223636.0, | |
| "reward": 1.032812476158142, | |
| "reward_std": 0.14363105595111847, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.12407395988702774, | |
| "rewards/reward_repetition/mean": 0.831250011920929, | |
| "rewards/reward_repetition/std": 0.12456272542476654, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7593458890914917, | |
| "sampling/importance_sampling_ratio/min": 1.2729455923859382e-14, | |
| "sampling/sampling_logp_difference/max": 31.994857788085938, | |
| "sampling/sampling_logp_difference/mean": 3.625725030899048, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.859375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2579868610482663, | |
| "epoch": 0.8387096774193549, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.0903428792953491, | |
| "learning_rate": 1.095890410958904e-06, | |
| "loss": -0.0077, | |
| "num_tokens": 27614409.0, | |
| "reward": 1.053125023841858, | |
| "reward_std": 0.18119609355926514, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.22187501192092896, | |
| "rewards/reward_coverage/std": 0.15682236850261688, | |
| "rewards/reward_repetition/mean": 0.8312499523162842, | |
| "rewards/reward_repetition/std": 0.12456272542476654, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7564480304718018, | |
| "sampling/importance_sampling_ratio/min": 3.942305127637401e-14, | |
| "sampling/sampling_logp_difference/max": 30.864425659179688, | |
| "sampling/sampling_logp_difference/mean": 3.604686975479126, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24791082250885665, | |
| "epoch": 0.8494623655913979, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 1.090151071548462, | |
| "learning_rate": 1.0273972602739727e-06, | |
| "loss": -0.0028, | |
| "num_tokens": 28005198.0, | |
| "reward": 1.029687523841858, | |
| "reward_std": 0.1480504870414734, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19218750298023224, | |
| "rewards/reward_coverage/std": 0.16837665438652039, | |
| "rewards/reward_repetition/mean": 0.8375000357627869, | |
| "rewards/reward_repetition/std": 0.106159508228302, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.771141767501831, | |
| "sampling/importance_sampling_ratio/min": 3.6129458528665753e-14, | |
| "sampling/sampling_logp_difference/max": 30.95166778564453, | |
| "sampling/sampling_logp_difference/mean": 3.4720849990844727, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.859375, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.25930464873090386, | |
| "epoch": 0.8602150537634409, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.083723545074463, | |
| "learning_rate": 9.589041095890411e-07, | |
| "loss": -0.0051, | |
| "num_tokens": 28395507.0, | |
| "reward": 1.046875, | |
| "reward_std": 0.15026018023490906, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.23125000298023224, | |
| "rewards/reward_coverage/std": 0.14015299081802368, | |
| "rewards/reward_repetition/mean": 0.815625011920929, | |
| "rewards/reward_repetition/std": 0.152459979057312, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7536813020706177, | |
| "sampling/importance_sampling_ratio/min": 9.19962348487624e-14, | |
| "sampling/sampling_logp_difference/max": 30.01702880859375, | |
| "sampling/sampling_logp_difference/mean": 3.546005964279175, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 40.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 40.0, | |
| "entropy": 0.24061511480249465, | |
| "epoch": 0.8709677419354839, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.8584389686584473, | |
| "learning_rate": 8.904109589041097e-07, | |
| "loss": -0.0066, | |
| "num_tokens": 28786289.0, | |
| "reward": 1.0609374046325684, | |
| "reward_std": 0.1303727924823761, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.2109375, | |
| "rewards/reward_coverage/std": 0.15130877494812012, | |
| "rewards/reward_repetition/mean": 0.8500000238418579, | |
| "rewards/reward_repetition/std": 0.10690449178218842, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7686938643455505, | |
| "sampling/importance_sampling_ratio/min": 2.277888706651854e-13, | |
| "sampling/sampling_logp_difference/max": 29.1103572845459, | |
| "sampling/sampling_logp_difference/mean": 3.516740560531616, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 36.0, | |
| "completions/mean_length": 39.84375, | |
| "completions/mean_terminated_length": 36.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "entropy": 0.23985581938177347, | |
| "epoch": 0.8817204301075269, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 0.9327372312545776, | |
| "learning_rate": 8.219178082191781e-07, | |
| "loss": -0.0095, | |
| "num_tokens": 29177061.0, | |
| "reward": 1.0640625953674316, | |
| "reward_std": 0.15688930451869965, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.21718749403953552, | |
| "rewards/reward_coverage/std": 0.13634072244167328, | |
| "rewards/reward_repetition/mean": 0.846875011920929, | |
| "rewards/reward_repetition/std": 0.12210942804813385, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7691041827201843, | |
| "sampling/importance_sampling_ratio/min": 9.13759844699269e-13, | |
| "sampling/sampling_logp_difference/max": 27.721208572387695, | |
| "sampling/sampling_logp_difference/mean": 3.6279892921447754, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 33.0, | |
| "completions/mean_length": 39.609375, | |
| "completions/mean_terminated_length": 33.0, | |
| "completions/min_length": 33.0, | |
| "completions/min_terminated_length": 33.0, | |
| "entropy": 0.2540010770317167, | |
| "epoch": 0.8924731182795699, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.7506739497184753, | |
| "learning_rate": 7.534246575342466e-07, | |
| "loss": -0.0126, | |
| "num_tokens": 29567814.0, | |
| "reward": 1.0328125953674316, | |
| "reward_std": 0.1834058165550232, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156249403953552, | |
| "rewards/reward_coverage/std": 0.13857802748680115, | |
| "rewards/reward_repetition/mean": 0.831250011920929, | |
| "rewards/reward_repetition/std": 0.12456272542476654, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7523236274719238, | |
| "sampling/importance_sampling_ratio/min": 5.4132771128059115e-14, | |
| "sampling/sampling_logp_difference/max": 30.54733657836914, | |
| "sampling/sampling_logp_difference/mean": 3.681413173675537, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.875, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.25169974751770496, | |
| "epoch": 0.9032258064516129, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.8115216493606567, | |
| "learning_rate": 6.849315068493151e-07, | |
| "loss": -0.0139, | |
| "num_tokens": 29958486.0, | |
| "reward": 1.0359375476837158, | |
| "reward_std": 0.13479222357273102, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.18593750894069672, | |
| "rewards/reward_coverage/std": 0.14014413952827454, | |
| "rewards/reward_repetition/mean": 0.8500000238418579, | |
| "rewards/reward_repetition/std": 0.10690449178218842, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.755419909954071, | |
| "sampling/importance_sampling_ratio/min": 5.566194003652804e-14, | |
| "sampling/sampling_logp_difference/max": 30.519479751586914, | |
| "sampling/sampling_logp_difference/mean": 3.5999581813812256, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2387481287587434, | |
| "epoch": 0.9139784946236559, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.7745798826217651, | |
| "learning_rate": 6.164383561643836e-07, | |
| "loss": -0.0112, | |
| "num_tokens": 30349169.0, | |
| "reward": 1.0515624284744263, | |
| "reward_std": 0.1834058165550232, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156250894069672, | |
| "rewards/reward_coverage/std": 0.15274441242218018, | |
| "rewards/reward_repetition/mean": 0.8500000238418579, | |
| "rewards/reward_repetition/std": 0.11818736046552658, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.76320481300354, | |
| "sampling/importance_sampling_ratio/min": 1.7256138737983123e-13, | |
| "sampling/sampling_logp_difference/max": 29.388023376464844, | |
| "sampling/sampling_logp_difference/mean": 3.6501305103302, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.765625, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2610483162570745, | |
| "epoch": 0.9247311827956989, | |
| "frac_reward_zero_std": 0.21875, | |
| "grad_norm": 1.2502846717834473, | |
| "learning_rate": 5.47945205479452e-07, | |
| "loss": -0.0119, | |
| "num_tokens": 30739838.0, | |
| "reward": 1.0515625476837158, | |
| "reward_std": 0.12595339119434357, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20781250298023224, | |
| "rewards/reward_coverage/std": 0.12885908782482147, | |
| "rewards/reward_repetition/mean": 0.84375, | |
| "rewards/reward_repetition/std": 0.11529809236526489, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7513002753257751, | |
| "sampling/importance_sampling_ratio/min": 2.8611465139379705e-14, | |
| "sampling/sampling_logp_difference/max": 31.184968948364258, | |
| "sampling/sampling_logp_difference/mean": 3.6847875118255615, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.23906523222103715, | |
| "epoch": 0.9354838709677419, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.7462826371192932, | |
| "learning_rate": 4.794520547945206e-07, | |
| "loss": -0.0061, | |
| "num_tokens": 31130530.0, | |
| "reward": 1.045312523841858, | |
| "reward_std": 0.1303728073835373, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156250894069672, | |
| "rewards/reward_coverage/std": 0.13391800224781036, | |
| "rewards/reward_repetition/mean": 0.84375, | |
| "rewards/reward_repetition/std": 0.10965313017368317, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7613617777824402, | |
| "sampling/importance_sampling_ratio/min": 9.466830531296155e-13, | |
| "sampling/sampling_logp_difference/max": 27.68581199645996, | |
| "sampling/sampling_logp_difference/mean": 3.6363892555236816, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2408477501012385, | |
| "epoch": 0.946236559139785, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 1.0979928970336914, | |
| "learning_rate": 4.1095890410958903e-07, | |
| "loss": -0.0084, | |
| "num_tokens": 31521313.0, | |
| "reward": 1.0375001430511475, | |
| "reward_std": 0.1590990126132965, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.19062499701976776, | |
| "rewards/reward_coverage/std": 0.14662203192710876, | |
| "rewards/reward_repetition/mean": 0.846875011920929, | |
| "rewards/reward_repetition/std": 0.09915315359830856, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7597459554672241, | |
| "sampling/importance_sampling_ratio/min": 1.093270764716825e-12, | |
| "sampling/sampling_logp_difference/max": 27.541847229003906, | |
| "sampling/sampling_logp_difference/mean": 3.6034116744995117, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 37.0, | |
| "completions/mean_length": 39.859375, | |
| "completions/mean_terminated_length": 37.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 37.0, | |
| "entropy": 0.254688891582191, | |
| "epoch": 0.956989247311828, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 0.8742080926895142, | |
| "learning_rate": 3.4246575342465755e-07, | |
| "loss": -0.0096, | |
| "num_tokens": 31911900.0, | |
| "reward": 1.0859375, | |
| "reward_std": 0.183405801653862, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.2515625059604645, | |
| "rewards/reward_coverage/std": 0.16522441804409027, | |
| "rewards/reward_repetition/mean": 0.8343749642372131, | |
| "rewards/reward_repetition/std": 0.13119566440582275, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7481080889701843, | |
| "sampling/importance_sampling_ratio/min": 1.0778268598939867e-16, | |
| "sampling/sampling_logp_difference/max": 36.766414642333984, | |
| "sampling/sampling_logp_difference/mean": 3.735830307006836, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24119682773016393, | |
| "epoch": 0.967741935483871, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 0.9537080526351929, | |
| "learning_rate": 2.73972602739726e-07, | |
| "loss": -0.0102, | |
| "num_tokens": 32302677.0, | |
| "reward": 1.078125, | |
| "reward_std": 0.15026018023490906, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.22812499105930328, | |
| "rewards/reward_coverage/std": 0.15783129632472992, | |
| "rewards/reward_repetition/mean": 0.8500000238418579, | |
| "rewards/reward_repetition/std": 0.11268723756074905, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7631887793540955, | |
| "sampling/importance_sampling_ratio/min": 4.8462983300891216e-14, | |
| "sampling/sampling_logp_difference/max": 30.657976150512695, | |
| "sampling/sampling_logp_difference/mean": 3.6851108074188232, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.24475648440420628, | |
| "epoch": 0.978494623655914, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 0.890794038772583, | |
| "learning_rate": 2.0547945205479452e-07, | |
| "loss": -0.004, | |
| "num_tokens": 32693369.0, | |
| "reward": 1.0171875953674316, | |
| "reward_std": 0.14363105595111847, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.17343750596046448, | |
| "rewards/reward_coverage/std": 0.13362134993076324, | |
| "rewards/reward_repetition/mean": 0.84375, | |
| "rewards/reward_repetition/std": 0.11529809236526489, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7631818652153015, | |
| "sampling/importance_sampling_ratio/min": 3.372482685910089e-14, | |
| "sampling/sampling_logp_difference/max": 31.02054214477539, | |
| "sampling/sampling_logp_difference/mean": 3.65973162651062, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 40.0, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 40.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2510380311869085, | |
| "epoch": 0.989247311827957, | |
| "frac_reward_zero_std": 0.09375, | |
| "grad_norm": 0.9964897632598877, | |
| "learning_rate": 1.36986301369863e-07, | |
| "loss": -0.0126, | |
| "num_tokens": 33084143.0, | |
| "reward": 1.0359375476837158, | |
| "reward_std": 0.17898640036582947, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.1953125, | |
| "rewards/reward_coverage/std": 0.1385064274072647, | |
| "rewards/reward_repetition/mean": 0.8406250476837158, | |
| "rewards/reward_repetition/std": 0.134186252951622, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.753570556640625, | |
| "sampling/importance_sampling_ratio/min": 6.38058653628466e-15, | |
| "sampling/sampling_logp_difference/max": 32.685516357421875, | |
| "sampling/sampling_logp_difference/mean": 3.6861093044281006, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 1.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 0.0, | |
| "completions/mean_length": 39.953125, | |
| "completions/mean_terminated_length": 0.0, | |
| "completions/min_length": 37.0, | |
| "completions/min_terminated_length": 0.0, | |
| "entropy": 0.2551991257350892, | |
| "epoch": 1.0, | |
| "frac_reward_zero_std": 0.15625, | |
| "grad_norm": 1.1012099981307983, | |
| "learning_rate": 6.84931506849315e-08, | |
| "loss": -0.0126, | |
| "num_tokens": 33474818.0, | |
| "reward": 1.0421874523162842, | |
| "reward_std": 0.15246988832950592, | |
| "rewards/reward_correct/mean": 0.0, | |
| "rewards/reward_correct/std": 0.0, | |
| "rewards/reward_coverage/mean": 0.20156250894069672, | |
| "rewards/reward_coverage/std": 0.1290898472070694, | |
| "rewards/reward_repetition/mean": 0.8406250476837158, | |
| "rewards/reward_repetition/std": 0.1293681114912033, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.7444443702697754, | |
| "sampling/importance_sampling_ratio/min": 2.801043902031508e-13, | |
| "sampling/sampling_logp_difference/max": 28.903614044189453, | |
| "sampling/sampling_logp_difference/mean": 3.683718204498291, | |
| "step": 93 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 93, | |
| "num_input_tokens_seen": 33474818, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |