Qwen3-1.7B-wordle / trainer_state.json
burtenshaw's picture
burtenshaw HF Staff
Upload folder using huggingface_hub
a27cb15 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 93,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 20.59375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1623296411707997,
"epoch": 0.010752688172043012,
"frac_reward_zero_std": 0.5625,
"grad_norm": 2.5920114517211914,
"learning_rate": 0.0,
"loss": -0.0655,
"num_tokens": 209228.0,
"reward": 0.10260416567325592,
"reward_std": 0.12079741060733795,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.01875000074505806,
"rewards/reward_coverage/std": 0.08886408805847168,
"rewards/reward_repetition/mean": 0.08385416865348816,
"rewards/reward_repetition/std": 0.1609596610069275,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5810753107070923,
"sampling/importance_sampling_ratio/min": 8.851574766937428e-15,
"sampling/sampling_logp_difference/max": 32.35818099975586,
"sampling/sampling_logp_difference/mean": 3.2135589122772217,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 19.34375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.2037830371409655,
"epoch": 0.021505376344086023,
"frac_reward_zero_std": 0.65625,
"grad_norm": 1.522868275642395,
"learning_rate": 2.5000000000000004e-07,
"loss": -0.0672,
"num_tokens": 411382.0,
"reward": 0.06171875074505806,
"reward_std": 0.08728349208831787,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0015625000232830644,
"rewards/reward_coverage/std": 0.01250000111758709,
"rewards/reward_repetition/mean": 0.06015624850988388,
"rewards/reward_repetition/std": 0.14286737143993378,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.572687029838562,
"sampling/importance_sampling_ratio/min": 6.514152938275704e-16,
"sampling/sampling_logp_difference/max": 34.967384338378906,
"sampling/sampling_logp_difference/mean": 3.2888572216033936,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 18.015625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1908840090036392,
"epoch": 0.03225806451612903,
"frac_reward_zero_std": 0.78125,
"grad_norm": 2.1042428016662598,
"learning_rate": 5.000000000000001e-07,
"loss": -0.0415,
"num_tokens": 589899.0,
"reward": 0.03333333134651184,
"reward_std": 0.03609190881252289,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0,
"rewards/reward_coverage/std": 0.0,
"rewards/reward_repetition/mean": 0.03333333134651184,
"rewards/reward_repetition/std": 0.09172075986862183,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.582237184047699,
"sampling/importance_sampling_ratio/min": 3.4790040246974845e-15,
"sampling/sampling_logp_difference/max": 33.292030334472656,
"sampling/sampling_logp_difference/mean": 3.1163713932037354,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 18.1875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.188672631047666,
"epoch": 0.043010752688172046,
"frac_reward_zero_std": 0.78125,
"grad_norm": 1.6742660999298096,
"learning_rate": 7.5e-07,
"loss": -0.0256,
"num_tokens": 784993.0,
"reward": 0.03932292014360428,
"reward_std": 0.046772170811891556,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0,
"rewards/reward_coverage/std": 0.0,
"rewards/reward_repetition/mean": 0.03932292014360428,
"rewards/reward_repetition/std": 0.10880006849765778,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5760836005210876,
"sampling/importance_sampling_ratio/min": 4.4242851705367583e-14,
"sampling/sampling_logp_difference/max": 30.749082565307617,
"sampling/sampling_logp_difference/mean": 3.1024224758148193,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 19.296875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1812530495226383,
"epoch": 0.053763440860215055,
"frac_reward_zero_std": 0.625,
"grad_norm": 3.190187931060791,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.061,
"num_tokens": 994168.0,
"reward": 0.07604166865348816,
"reward_std": 0.10753916203975677,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0062500000931322575,
"rewards/reward_coverage/std": 0.035073623061180115,
"rewards/reward_repetition/mean": 0.06979166716337204,
"rewards/reward_repetition/std": 0.1535550206899643,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5558198690414429,
"sampling/importance_sampling_ratio/min": 1.0327031483188718e-17,
"sampling/sampling_logp_difference/max": 39.11176681518555,
"sampling/sampling_logp_difference/mean": 3.2322075366973877,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 18.40625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1822001421824098,
"epoch": 0.06451612903225806,
"frac_reward_zero_std": 0.71875,
"grad_norm": 2.3724122047424316,
"learning_rate": 1.25e-06,
"loss": -0.0569,
"num_tokens": 1177412.0,
"reward": 0.04401041567325592,
"reward_std": 0.062240131199359894,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0,
"rewards/reward_coverage/std": 0.0,
"rewards/reward_repetition/mean": 0.04401041567325592,
"rewards/reward_repetition/std": 0.11309215426445007,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5901535749435425,
"sampling/importance_sampling_ratio/min": 2.3622067538522137e-15,
"sampling/sampling_logp_difference/max": 33.67918014526367,
"sampling/sampling_logp_difference/mean": 3.0934648513793945,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 20.203125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1859856164082885,
"epoch": 0.07526881720430108,
"frac_reward_zero_std": 0.625,
"grad_norm": 2.2745258808135986,
"learning_rate": 1.5e-06,
"loss": -0.07,
"num_tokens": 1379577.0,
"reward": 0.08828125149011612,
"reward_std": 0.10496117174625397,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.02031249925494194,
"rewards/reward_coverage/std": 0.08578246831893921,
"rewards/reward_repetition/mean": 0.06796875596046448,
"rewards/reward_repetition/std": 0.13983187079429626,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5829952359199524,
"sampling/importance_sampling_ratio/min": 1.7831107242271977e-19,
"sampling/sampling_logp_difference/max": 43.17075729370117,
"sampling/sampling_logp_difference/mean": 3.094505786895752,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 19.765625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.1645366102457047,
"epoch": 0.08602150537634409,
"frac_reward_zero_std": 0.71875,
"grad_norm": 2.285911798477173,
"learning_rate": 1.75e-06,
"loss": -0.0463,
"num_tokens": 1562860.0,
"reward": 0.06822916865348816,
"reward_std": 0.05671586096286774,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0,
"rewards/reward_coverage/std": 0.0,
"rewards/reward_repetition/mean": 0.06822916865348816,
"rewards/reward_repetition/std": 0.13447654247283936,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.6059508323669434,
"sampling/importance_sampling_ratio/min": 1.5984405334448e-15,
"sampling/sampling_logp_difference/max": 34.06974792480469,
"sampling/sampling_logp_difference/mean": 3.0470666885375977,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 19.75,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.2150460854172707,
"epoch": 0.0967741935483871,
"frac_reward_zero_std": 0.5625,
"grad_norm": 2.4473977088928223,
"learning_rate": 2.0000000000000003e-06,
"loss": -0.0729,
"num_tokens": 1750970.0,
"reward": 0.07604166865348816,
"reward_std": 0.10753916203975677,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.004687500186264515,
"rewards/reward_coverage/std": 0.03750000149011612,
"rewards/reward_repetition/mean": 0.07135416567325592,
"rewards/reward_repetition/std": 0.14673006534576416,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.564116358757019,
"sampling/importance_sampling_ratio/min": 5.549738260316673e-19,
"sampling/sampling_logp_difference/max": 42.03536605834961,
"sampling/sampling_logp_difference/mean": 3.24617075920105,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 20.125,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.1741499826312065,
"epoch": 0.10752688172043011,
"frac_reward_zero_std": 0.5625,
"grad_norm": 2.0894999504089355,
"learning_rate": 2.25e-06,
"loss": -0.0295,
"num_tokens": 1955314.0,
"reward": 0.08229167759418488,
"reward_std": 0.0766032412648201,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0062500000931322575,
"rewards/reward_coverage/std": 0.05000000447034836,
"rewards/reward_repetition/mean": 0.07604166865348816,
"rewards/reward_repetition/std": 0.12633328139781952,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5676860809326172,
"sampling/importance_sampling_ratio/min": 1.0015881904035955e-15,
"sampling/sampling_logp_difference/max": 34.53718948364258,
"sampling/sampling_logp_difference/mean": 3.318382978439331,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 20.375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.19177012052387,
"epoch": 0.11827956989247312,
"frac_reward_zero_std": 0.625,
"grad_norm": 2.127878189086914,
"learning_rate": 2.5e-06,
"loss": -0.044,
"num_tokens": 2152810.0,
"reward": 0.08697916567325592,
"reward_std": 0.09207119792699814,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0,
"rewards/reward_coverage/std": 0.0,
"rewards/reward_repetition/mean": 0.08697916567325592,
"rewards/reward_repetition/std": 0.1668112874031067,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5731910467147827,
"sampling/importance_sampling_ratio/min": 5.365229931202564e-15,
"sampling/sampling_logp_difference/max": 32.85883712768555,
"sampling/sampling_logp_difference/mean": 3.2824923992156982,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 20.640625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 0.0,
"entropy": 1.2268892796710134,
"epoch": 0.12903225806451613,
"frac_reward_zero_std": 0.53125,
"grad_norm": 2.617131233215332,
"learning_rate": 2.7500000000000004e-06,
"loss": -0.0849,
"num_tokens": 2354987.0,
"reward": 0.10390624403953552,
"reward_std": 0.1211656928062439,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0078125,
"rewards/reward_coverage/std": 0.0625,
"rewards/reward_repetition/mean": 0.09609375149011612,
"rewards/reward_repetition/std": 0.16450294852256775,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5544115900993347,
"sampling/importance_sampling_ratio/min": 7.363439390216395e-18,
"sampling/sampling_logp_difference/max": 39.45000457763672,
"sampling/sampling_logp_difference/mean": 3.3240010738372803,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 34.0,
"completions/mean_length": 23.421875,
"completions/mean_terminated_length": 34.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.1650395467877388,
"epoch": 0.13978494623655913,
"frac_reward_zero_std": 0.375,
"grad_norm": 2.670605182647705,
"learning_rate": 3e-06,
"loss": -0.0758,
"num_tokens": 2616268.0,
"reward": 0.18046876788139343,
"reward_std": 0.17567184567451477,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.04218750074505806,
"rewards/reward_coverage/std": 0.13190266489982605,
"rewards/reward_repetition/mean": 0.13828124105930328,
"rewards/reward_repetition/std": 0.18624858558177948,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5682023167610168,
"sampling/importance_sampling_ratio/min": 1.0665693960122588e-19,
"sampling/sampling_logp_difference/max": 43.684669494628906,
"sampling/sampling_logp_difference/mean": 3.6000094413757324,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 34.0,
"completions/mean_length": 27.984375,
"completions/mean_terminated_length": 34.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.111030412837863,
"epoch": 0.15053763440860216,
"frac_reward_zero_std": 0.21875,
"grad_norm": 2.47416615486145,
"learning_rate": 3.2500000000000002e-06,
"loss": -0.0662,
"num_tokens": 2906261.0,
"reward": 0.2278645932674408,
"reward_std": 0.17272555828094482,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.03593749925494194,
"rewards/reward_coverage/std": 0.09489709138870239,
"rewards/reward_repetition/mean": 0.19192710518836975,
"rewards/reward_repetition/std": 0.19924886524677277,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.540465772151947,
"sampling/importance_sampling_ratio/min": 4.225438084884613e-17,
"sampling/sampling_logp_difference/max": 37.702823638916016,
"sampling/sampling_logp_difference/mean": 3.798464298248291,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 28.203125,
"completions/mean_terminated_length": 35.5,
"completions/min_length": 16.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.0863127624616027,
"epoch": 0.16129032258064516,
"frac_reward_zero_std": 0.15625,
"grad_norm": 3.092238664627075,
"learning_rate": 3.5e-06,
"loss": -0.0628,
"num_tokens": 3202938.0,
"reward": 0.30156248807907104,
"reward_std": 0.19813722372055054,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.0390625,
"rewards/reward_coverage/std": 0.10483121871948242,
"rewards/reward_repetition/mean": 0.26249998807907104,
"rewards/reward_repetition/std": 0.20803949236869812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5526077151298523,
"sampling/importance_sampling_ratio/min": 3.7098329921141575e-20,
"sampling/sampling_logp_difference/max": 44.74071502685547,
"sampling/sampling_logp_difference/mean": 3.7845804691314697,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 30.90625,
"completions/mean_terminated_length": 35.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.0002785623073578,
"epoch": 0.17204301075268819,
"frac_reward_zero_std": 0.09375,
"grad_norm": 2.3450825214385986,
"learning_rate": 3.7500000000000005e-06,
"loss": -0.0623,
"num_tokens": 3523934.0,
"reward": 0.41588544845581055,
"reward_std": 0.2868938446044922,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.08281250298023224,
"rewards/reward_coverage/std": 0.15384459495544434,
"rewards/reward_repetition/mean": 0.3330729007720947,
"rewards/reward_repetition/std": 0.2281644642353058,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.544209361076355,
"sampling/importance_sampling_ratio/min": 1.4981450562263129e-16,
"sampling/sampling_logp_difference/max": 36.4371337890625,
"sampling/sampling_logp_difference/mean": 4.021841526031494,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 33.53125,
"completions/mean_terminated_length": 34.333335876464844,
"completions/min_length": 16.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.8482576478272676,
"epoch": 0.1827956989247312,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.6775325536727905,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0187,
"num_tokens": 3875784.0,
"reward": 0.5098958015441895,
"reward_std": 0.20402978360652924,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.13437500596046448,
"rewards/reward_coverage/std": 0.16639859974384308,
"rewards/reward_repetition/mean": 0.37552082538604736,
"rewards/reward_repetition/std": 0.17110413312911987,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5547957420349121,
"sampling/importance_sampling_ratio/min": 1.9580492778949622e-20,
"sampling/sampling_logp_difference/max": 45.37975311279297,
"sampling/sampling_logp_difference/mean": 3.881425142288208,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 35.75,
"completions/mean_terminated_length": 36.400001525878906,
"completions/min_length": 16.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.7285963352769613,
"epoch": 0.1935483870967742,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.9497219324111938,
"learning_rate": 4.25e-06,
"loss": -0.0222,
"num_tokens": 4230450.0,
"reward": 0.6015625596046448,
"reward_std": 0.17456699907779694,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.15000000596046448,
"rewards/reward_coverage/std": 0.18856181204319,
"rewards/reward_repetition/mean": 0.4515624940395355,
"rewards/reward_repetition/std": 0.16296739876270294,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5840170383453369,
"sampling/importance_sampling_ratio/min": 1.3230608049300072e-14,
"sampling/sampling_logp_difference/max": 31.95624351501465,
"sampling/sampling_logp_difference/mean": 3.7303450107574463,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 35.5,
"completions/mean_terminated_length": 35.333335876464844,
"completions/min_length": 16.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.7492767116054893,
"epoch": 0.20430107526881722,
"frac_reward_zero_std": 0.15625,
"grad_norm": 2.027407169342041,
"learning_rate": 4.5e-06,
"loss": -0.0178,
"num_tokens": 4592424.0,
"reward": 0.5973958373069763,
"reward_std": 0.2113954722881317,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.13124999403953552,
"rewards/reward_coverage/std": 0.1670234352350235,
"rewards/reward_repetition/mean": 0.4661458432674408,
"rewards/reward_repetition/std": 0.17673227190971375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.5936897993087769,
"sampling/importance_sampling_ratio/min": 3.848576383998589e-20,
"sampling/sampling_logp_difference/max": 44.70399856567383,
"sampling/sampling_logp_difference/mean": 3.7467870712280273,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 37.25,
"completions/mean_terminated_length": 36.33333206176758,
"completions/min_length": 24.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.6496950350701809,
"epoch": 0.21505376344086022,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.7340210676193237,
"learning_rate": 4.75e-06,
"loss": -0.0028,
"num_tokens": 4973854.0,
"reward": 0.6968749761581421,
"reward_std": 0.1944543719291687,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1875,
"rewards/reward_coverage/std": 0.17320507764816284,
"rewards/reward_repetition/mean": 0.5093749761581421,
"rewards/reward_repetition/std": 0.14407385885715485,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.6289246678352356,
"sampling/importance_sampling_ratio/min": 1.4430105039911333e-19,
"sampling/sampling_logp_difference/max": 43.38238525390625,
"sampling/sampling_logp_difference/mean": 3.522400379180908,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 37.96875,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 16.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.6573502826504409,
"epoch": 0.22580645161290322,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.9576199054718018,
"learning_rate": 5e-06,
"loss": -0.0085,
"num_tokens": 5357700.0,
"reward": 0.715624988079071,
"reward_std": 0.1944543719291687,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.17499999701976776,
"rewards/reward_coverage/std": 0.1736626923084259,
"rewards/reward_repetition/mean": 0.5406249761581421,
"rewards/reward_repetition/std": 0.1649615317583084,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.666456401348114,
"sampling/importance_sampling_ratio/min": 1.55370423422568e-14,
"sampling/sampling_logp_difference/max": 31.795549392700195,
"sampling/sampling_logp_difference/mean": 3.181442975997925,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 38.859375,
"completions/mean_terminated_length": 36.66666793823242,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.5784921627491713,
"epoch": 0.23655913978494625,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.6491539478302002,
"learning_rate": 4.931506849315069e-06,
"loss": 0.009,
"num_tokens": 5746525.0,
"reward": 0.792187511920929,
"reward_std": 0.1834058165550232,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.17772118747234344,
"rewards/reward_repetition/mean": 0.5906250476837158,
"rewards/reward_repetition/std": 0.1399759203195572,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.6919029355049133,
"sampling/importance_sampling_ratio/min": 2.5034810497841535e-16,
"sampling/sampling_logp_difference/max": 35.92367935180664,
"sampling/sampling_logp_difference/mean": 3.11592435836792,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.03125,
"completions/mean_terminated_length": 35.5,
"completions/min_length": 32.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.5693019391037524,
"epoch": 0.24731182795698925,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.9411438703536987,
"learning_rate": 4.863013698630138e-06,
"loss": -0.0017,
"num_tokens": 6135539.0,
"reward": 0.859375,
"reward_std": 0.19887377321720123,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.16855332255363464,
"rewards/reward_repetition/mean": 0.6578124761581421,
"rewards/reward_repetition/std": 0.1950211226940155,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.6962894201278687,
"sampling/importance_sampling_ratio/min": 5.379221697881343e-17,
"sampling/sampling_logp_difference/max": 37.461402893066406,
"sampling/sampling_logp_difference/mean": 2.938572645187378,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.109375,
"completions/mean_terminated_length": 35.20000076293945,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.521631199400872,
"epoch": 0.25806451612903225,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.231412172317505,
"learning_rate": 4.7945205479452054e-06,
"loss": -0.0117,
"num_tokens": 6524620.0,
"reward": 0.862500011920929,
"reward_std": 0.20329320430755615,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18437500298023224,
"rewards/reward_coverage/std": 0.1801399439573288,
"rewards/reward_repetition/mean": 0.6781250238418579,
"rewards/reward_repetition/std": 0.1656816154718399,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7251441478729248,
"sampling/importance_sampling_ratio/min": 1.906062915100565e-20,
"sampling/sampling_logp_difference/max": 45.40666198730469,
"sampling/sampling_logp_difference/mean": 2.8243846893310547,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.484375,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.46869752556085587,
"epoch": 0.26881720430107525,
"frac_reward_zero_std": 0.0625,
"grad_norm": 1.5307422876358032,
"learning_rate": 4.726027397260274e-06,
"loss": -0.009,
"num_tokens": 6914737.0,
"reward": 0.9578125476837158,
"reward_std": 0.19224464893341064,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.2109375,
"rewards/reward_coverage/std": 0.16914087533950806,
"rewards/reward_repetition/mean": 0.7468750476837158,
"rewards/reward_repetition/std": 0.1603258103132248,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7427772879600525,
"sampling/importance_sampling_ratio/min": 6.379089188001887e-19,
"sampling/sampling_logp_difference/max": 41.89609146118164,
"sampling/sampling_logp_difference/mean": 2.6809170246124268,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.53125,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.44641283014789224,
"epoch": 0.27956989247311825,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.65163254737854,
"learning_rate": 4.657534246575343e-06,
"loss": -0.0002,
"num_tokens": 7305211.0,
"reward": 0.9937499761581421,
"reward_std": 0.18119609355926514,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.171875,
"rewards/reward_coverage/std": 0.15272004902362823,
"rewards/reward_repetition/mean": 0.8218749761581421,
"rewards/reward_repetition/std": 0.15580691397190094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.742205023765564,
"sampling/importance_sampling_ratio/min": 1.740283318765294e-20,
"sampling/sampling_logp_difference/max": 45.49765396118164,
"sampling/sampling_logp_difference/mean": 2.5815634727478027,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.5625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.39991177897900343,
"epoch": 0.2903225806451613,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.0949409008026123,
"learning_rate": 4.589041095890411e-06,
"loss": -0.0053,
"num_tokens": 7695441.0,
"reward": 1.0,
"reward_std": 0.2121320217847824,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.16874998807907104,
"rewards/reward_coverage/std": 0.1780627816915512,
"rewards/reward_repetition/mean": 0.831250011920929,
"rewards/reward_repetition/std": 0.14786845445632935,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7544648051261902,
"sampling/importance_sampling_ratio/min": 6.676384620995922e-19,
"sampling/sampling_logp_difference/max": 41.85054016113281,
"sampling/sampling_logp_difference/mean": 2.4634246826171875,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.5625,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.3994480683468282,
"epoch": 0.3010752688172043,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.35282301902771,
"learning_rate": 4.52054794520548e-06,
"loss": -0.0062,
"num_tokens": 8086007.0,
"reward": 1.015625,
"reward_std": 0.20329320430755615,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21250000596046448,
"rewards/reward_coverage/std": 0.1685606688261032,
"rewards/reward_repetition/mean": 0.8031250238418579,
"rewards/reward_repetition/std": 0.15732762217521667,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7630195617675781,
"sampling/importance_sampling_ratio/min": 6.418834864883299e-17,
"sampling/sampling_logp_difference/max": 37.28470993041992,
"sampling/sampling_logp_difference/mean": 2.45412540435791,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.609375,
"completions/mean_terminated_length": 36.5,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.33232213323935866,
"epoch": 0.3118279569892473,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.4301323890686035,
"learning_rate": 4.4520547945205486e-06,
"loss": -0.0026,
"num_tokens": 8476412.0,
"reward": 1.0406250953674316,
"reward_std": 0.15026018023490906,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21562501788139343,
"rewards/reward_coverage/std": 0.14498905837535858,
"rewards/reward_repetition/mean": 0.824999988079071,
"rewards/reward_repetition/std": 0.14474937319755554,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7893605828285217,
"sampling/importance_sampling_ratio/min": 6.775734351848958e-14,
"sampling/sampling_logp_difference/max": 30.322843551635742,
"sampling/sampling_logp_difference/mean": 2.41098690032959,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.65625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.31409848271869123,
"epoch": 0.3225806451612903,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2845097780227661,
"learning_rate": 4.383561643835616e-06,
"loss": -0.01,
"num_tokens": 8866724.0,
"reward": 1.0109374523162842,
"reward_std": 0.1480504870414734,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18593749403953552,
"rewards/reward_coverage/std": 0.13076962530612946,
"rewards/reward_repetition/mean": 0.824999988079071,
"rewards/reward_repetition/std": 0.14907118678092957,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7840973734855652,
"sampling/importance_sampling_ratio/min": 1.471832771813246e-16,
"sampling/sampling_logp_difference/max": 36.45485305786133,
"sampling/sampling_logp_difference/mean": 2.4330239295959473,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.796875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 33.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.3091448312625289,
"epoch": 0.3333333333333333,
"frac_reward_zero_std": 0.0625,
"grad_norm": 1.2038891315460205,
"learning_rate": 4.315068493150685e-06,
"loss": -0.0041,
"num_tokens": 9257223.0,
"reward": 1.017187476158142,
"reward_std": 0.20550289750099182,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20468750596046448,
"rewards/reward_coverage/std": 0.1803257167339325,
"rewards/reward_repetition/mean": 0.8125,
"rewards/reward_repetition/std": 0.1374368518590927,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7869740128517151,
"sampling/importance_sampling_ratio/min": 8.298585072157721e-15,
"sampling/sampling_logp_difference/max": 32.422691345214844,
"sampling/sampling_logp_difference/mean": 2.4976119995117188,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.890625,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.2876437115482986,
"epoch": 0.34408602150537637,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.9871541261672974,
"learning_rate": 4.246575342465754e-06,
"loss": 0.0037,
"num_tokens": 9647784.0,
"reward": 1.032812476158142,
"reward_std": 0.17456698417663574,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18281251192092896,
"rewards/reward_coverage/std": 0.15384458005428314,
"rewards/reward_repetition/mean": 0.8500000238418579,
"rewards/reward_repetition/std": 0.1380131095647812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7916483283042908,
"sampling/importance_sampling_ratio/min": 1.612893541327095e-14,
"sampling/sampling_logp_difference/max": 31.758161544799805,
"sampling/sampling_logp_difference/mean": 2.4329771995544434,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.796875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.27647654921747744,
"epoch": 0.3548387096774194,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.4954354763031006,
"learning_rate": 4.178082191780822e-06,
"loss": -0.0059,
"num_tokens": 10038289.0,
"reward": 1.0218749046325684,
"reward_std": 0.18119610846042633,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20937499403953552,
"rewards/reward_coverage/std": 0.1687665432691574,
"rewards/reward_repetition/mean": 0.8125,
"rewards/reward_repetition/std": 0.12279806286096573,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7887318730354309,
"sampling/importance_sampling_ratio/min": 8.831320397643036e-19,
"sampling/sampling_logp_difference/max": 41.5708122253418,
"sampling/sampling_logp_difference/mean": 2.5059313774108887,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.84375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.27092319959774613,
"epoch": 0.3655913978494624,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.6384416818618774,
"learning_rate": 4.109589041095891e-06,
"loss": -0.0034,
"num_tokens": 10428797.0,
"reward": 1.0593750476837158,
"reward_std": 0.17235726118087769,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.23125001788139343,
"rewards/reward_coverage/std": 0.15314172208309174,
"rewards/reward_repetition/mean": 0.828125,
"rewards/reward_repetition/std": 0.12782520055770874,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7671718597412109,
"sampling/importance_sampling_ratio/min": 6.987482079498287e-19,
"sampling/sampling_logp_difference/max": 41.804996490478516,
"sampling/sampling_logp_difference/mean": 2.6257989406585693,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.8125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2952587741892785,
"epoch": 0.3763440860215054,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.0413151979446411,
"learning_rate": 4.0410958904109595e-06,
"loss": -0.0099,
"num_tokens": 10819301.0,
"reward": 0.9296875596046448,
"reward_std": 0.13921163976192474,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.16093750298023224,
"rewards/reward_coverage/std": 0.12550494074821472,
"rewards/reward_repetition/mean": 0.768750011920929,
"rewards/reward_repetition/std": 0.11391307413578033,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7573365569114685,
"sampling/importance_sampling_ratio/min": 1.5061544475743168e-18,
"sampling/sampling_logp_difference/max": 41.03697204589844,
"sampling/sampling_logp_difference/mean": 2.6861050128936768,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.27068308740854263,
"epoch": 0.3870967741935484,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.277216911315918,
"learning_rate": 3.972602739726027e-06,
"loss": -0.0004,
"num_tokens": 11209548.0,
"reward": 0.989062488079071,
"reward_std": 0.17014756798744202,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21406251192092896,
"rewards/reward_coverage/std": 0.17262418568134308,
"rewards/reward_repetition/mean": 0.7749999761581421,
"rewards/reward_repetition/std": 0.1154700517654419,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7654402256011963,
"sampling/importance_sampling_ratio/min": 4.38677482468737e-14,
"sampling/sampling_logp_difference/max": 30.757596969604492,
"sampling/sampling_logp_difference/mean": 2.659276008605957,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.796875,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.2760939297731966,
"epoch": 0.3978494623655914,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.504520297050476,
"learning_rate": 3.904109589041096e-06,
"loss": -0.0042,
"num_tokens": 11599953.0,
"reward": 0.9546874761581421,
"reward_std": 0.15246990323066711,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.17656250298023224,
"rewards/reward_coverage/std": 0.1318274438381195,
"rewards/reward_repetition/mean": 0.7781250476837158,
"rewards/reward_repetition/std": 0.12404395639896393,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7589582800865173,
"sampling/importance_sampling_ratio/min": 1.2950150141564556e-22,
"sampling/sampling_logp_difference/max": 50.39834976196289,
"sampling/sampling_logp_difference/mean": 2.7161073684692383,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.27736608777195215,
"epoch": 0.40860215053763443,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.6770557165145874,
"learning_rate": 3.8356164383561645e-06,
"loss": -0.0018,
"num_tokens": 11990363.0,
"reward": 0.926562488079071,
"reward_std": 0.14363107085227966,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.17031250894069672,
"rewards/reward_coverage/std": 0.15293914079666138,
"rewards/reward_repetition/mean": 0.7562500238418579,
"rewards/reward_repetition/std": 0.1152980849146843,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7690252661705017,
"sampling/importance_sampling_ratio/min": 5.327883295646056e-16,
"sampling/sampling_logp_difference/max": 35.16840744018555,
"sampling/sampling_logp_difference/mean": 2.6216495037078857,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.8125,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.30535601382143795,
"epoch": 0.41935483870967744,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.455296277999878,
"learning_rate": 3.767123287671233e-06,
"loss": -0.0019,
"num_tokens": 12380581.0,
"reward": 0.948437511920929,
"reward_std": 0.2010834813117981,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1796875,
"rewards/reward_coverage/std": 0.16825877130031586,
"rewards/reward_repetition/mean": 0.7687499523162842,
"rewards/reward_repetition/std": 0.12456272542476654,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7560758590698242,
"sampling/importance_sampling_ratio/min": 1.0004220113236836e-18,
"sampling/sampling_logp_difference/max": 41.446109771728516,
"sampling/sampling_logp_difference/mean": 2.5334110260009766,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.30565810902044177,
"epoch": 0.43010752688172044,
"frac_reward_zero_std": 0.03125,
"grad_norm": 1.2029423713684082,
"learning_rate": 3.6986301369863014e-06,
"loss": -0.0015,
"num_tokens": 12770797.0,
"reward": 0.96875,
"reward_std": 0.19003495573997498,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19062501192092896,
"rewards/reward_coverage/std": 0.15504096448421478,
"rewards/reward_repetition/mean": 0.778124988079071,
"rewards/reward_repetition/std": 0.12404395639896393,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7597452402114868,
"sampling/importance_sampling_ratio/min": 1.379195585862747e-12,
"sampling/sampling_logp_difference/max": 27.309520721435547,
"sampling/sampling_logp_difference/mean": 2.4547033309936523,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 33.0,
"completions/mean_length": 39.6875,
"completions/mean_terminated_length": 33.0,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.31394060072489083,
"epoch": 0.44086021505376344,
"frac_reward_zero_std": 0.0625,
"grad_norm": 1.7512142658233643,
"learning_rate": 3.6301369863013704e-06,
"loss": 0.0003,
"num_tokens": 13161191.0,
"reward": 1.009374976158142,
"reward_std": 0.18561550974845886,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.22499999403953552,
"rewards/reward_coverage/std": 0.15936382114887238,
"rewards/reward_repetition/mean": 0.7843749523162842,
"rewards/reward_repetition/std": 0.13940775394439697,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7532055974006653,
"sampling/importance_sampling_ratio/min": 3.4107995156513595e-17,
"sampling/sampling_logp_difference/max": 37.91699981689453,
"sampling/sampling_logp_difference/mean": 2.571798086166382,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.301532520679757,
"epoch": 0.45161290322580644,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.6585807800292969,
"learning_rate": 3.5616438356164386e-06,
"loss": -0.0042,
"num_tokens": 13551781.0,
"reward": 0.953125,
"reward_std": 0.13258251547813416,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19375000894069672,
"rewards/reward_coverage/std": 0.1562202423810959,
"rewards/reward_repetition/mean": 0.7593749761581421,
"rewards/reward_repetition/std": 0.10796640068292618,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7576152682304382,
"sampling/importance_sampling_ratio/min": 4.25440330295826e-18,
"sampling/sampling_logp_difference/max": 39.99857711791992,
"sampling/sampling_logp_difference/mean": 2.5987966060638428,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2994292816147208,
"epoch": 0.46236559139784944,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.1511462926864624,
"learning_rate": 3.4931506849315072e-06,
"loss": -0.0023,
"num_tokens": 13941920.0,
"reward": 0.9578125476837158,
"reward_std": 0.12595339119434357,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19218748807907104,
"rewards/reward_coverage/std": 0.15045401453971863,
"rewards/reward_repetition/mean": 0.765625,
"rewards/reward_repetition/std": 0.10422617197036743,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7622615694999695,
"sampling/importance_sampling_ratio/min": 1.1524427466063367e-15,
"sampling/sampling_logp_difference/max": 34.39689254760742,
"sampling/sampling_logp_difference/mean": 2.527230978012085,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.9375,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.2939116738270968,
"epoch": 0.4731182795698925,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.2386677265167236,
"learning_rate": 3.4246575342465754e-06,
"loss": -0.0099,
"num_tokens": 14332340.0,
"reward": 0.9515625238418579,
"reward_std": 0.1480504721403122,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19218750298023224,
"rewards/reward_coverage/std": 0.15461647510528564,
"rewards/reward_repetition/mean": 0.7593749761581421,
"rewards/reward_repetition/std": 0.12436345219612122,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7619537711143494,
"sampling/importance_sampling_ratio/min": 3.6812737413604546e-19,
"sampling/sampling_logp_difference/max": 42.445858001708984,
"sampling/sampling_logp_difference/mean": 2.5976765155792236,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.890625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2781213163398206,
"epoch": 0.4838709677419355,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.170350193977356,
"learning_rate": 3.356164383561644e-06,
"loss": -0.0036,
"num_tokens": 14722931.0,
"reward": 0.9953124523162842,
"reward_std": 0.12153396755456924,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1953125,
"rewards/reward_coverage/std": 0.14412261545658112,
"rewards/reward_repetition/mean": 0.800000011920929,
"rewards/reward_repetition/std": 0.08728715777397156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7608794569969177,
"sampling/importance_sampling_ratio/min": 3.331248850987206e-13,
"sampling/sampling_logp_difference/max": 28.73025894165039,
"sampling/sampling_logp_difference/mean": 2.6023521423339844,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.84375,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.27238480327650905,
"epoch": 0.4946236559139785,
"frac_reward_zero_std": 0.03125,
"grad_norm": 1.1889655590057373,
"learning_rate": 3.2876712328767123e-06,
"loss": -0.005,
"num_tokens": 15113515.0,
"reward": 0.9437500238418579,
"reward_std": 0.17235726118087769,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19062499701976776,
"rewards/reward_coverage/std": 0.14333748817443848,
"rewards/reward_repetition/mean": 0.7531249523162842,
"rewards/reward_repetition/std": 0.09915315359830856,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7437648177146912,
"sampling/importance_sampling_ratio/min": 3.385970521172965e-19,
"sampling/sampling_logp_difference/max": 42.529476165771484,
"sampling/sampling_logp_difference/mean": 2.778402328491211,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.890625,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.24875370506197214,
"epoch": 0.5053763440860215,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.1314892768859863,
"learning_rate": 3.2191780821917813e-06,
"loss": -0.0031,
"num_tokens": 15503912.0,
"reward": 0.942187488079071,
"reward_std": 0.15688931941986084,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18906250596046448,
"rewards/reward_coverage/std": 0.12230224162340164,
"rewards/reward_repetition/mean": 0.7531249523162842,
"rewards/reward_repetition/std": 0.12210943549871445,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7534346580505371,
"sampling/importance_sampling_ratio/min": 3.298192560128319e-15,
"sampling/sampling_logp_difference/max": 33.345401763916016,
"sampling/sampling_logp_difference/mean": 2.7750205993652344,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.875,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.24461835296824574,
"epoch": 0.5161290322580645,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.6554945111274719,
"learning_rate": 3.1506849315068495e-06,
"loss": -0.0048,
"num_tokens": 15894138.0,
"reward": 0.953125,
"reward_std": 0.16793785989284515,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21875,
"rewards/reward_coverage/std": 0.13554710149765015,
"rewards/reward_repetition/mean": 0.734375,
"rewards/reward_repetition/std": 0.12372364103794098,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7367614507675171,
"sampling/importance_sampling_ratio/min": 1.2913128950274503e-18,
"sampling/sampling_logp_difference/max": 41.19087219238281,
"sampling/sampling_logp_difference/mean": 2.8969175815582275,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.221066806698218,
"epoch": 0.5268817204301075,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.813700795173645,
"learning_rate": 3.082191780821918e-06,
"loss": -0.0125,
"num_tokens": 16284827.0,
"reward": 0.9203125238418579,
"reward_std": 0.1657281517982483,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18906250596046448,
"rewards/reward_coverage/std": 0.1310727298259735,
"rewards/reward_repetition/mean": 0.7312500476837158,
"rewards/reward_repetition/std": 0.12456272542476654,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7454463839530945,
"sampling/importance_sampling_ratio/min": 2.6984808100466543e-19,
"sampling/sampling_logp_difference/max": 42.75642776489258,
"sampling/sampling_logp_difference/mean": 3.0947470664978027,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2340390719473362,
"epoch": 0.5376344086021505,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.1709717512130737,
"learning_rate": 3.0136986301369864e-06,
"loss": -0.0098,
"num_tokens": 16675257.0,
"reward": 0.9140625596046448,
"reward_std": 0.13479222357273102,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18906250596046448,
"rewards/reward_coverage/std": 0.14155283570289612,
"rewards/reward_repetition/mean": 0.7250000238418579,
"rewards/reward_repetition/std": 0.10983392596244812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.732937216758728,
"sampling/importance_sampling_ratio/min": 1.6756041496787032e-16,
"sampling/sampling_logp_difference/max": 36.32518768310547,
"sampling/sampling_logp_difference/mean": 3.183100700378418,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 39.0,
"completions/mean_length": 39.78125,
"completions/mean_terminated_length": 39.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.20713584939949214,
"epoch": 0.5483870967741935,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.8335843086242676,
"learning_rate": 2.945205479452055e-06,
"loss": -0.0031,
"num_tokens": 17065845.0,
"reward": 0.953125,
"reward_std": 0.15026019513607025,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21875,
"rewards/reward_coverage/std": 0.1390158236026764,
"rewards/reward_repetition/mean": 0.734375,
"rewards/reward_repetition/std": 0.10722880065441132,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.739495038986206,
"sampling/importance_sampling_ratio/min": 4.677705163318169e-13,
"sampling/sampling_logp_difference/max": 28.390798568725586,
"sampling/sampling_logp_difference/mean": 3.198160409927368,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.9375,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.19506761734373868,
"epoch": 0.5591397849462365,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.6900437474250793,
"learning_rate": 2.876712328767123e-06,
"loss": -0.0002,
"num_tokens": 17456339.0,
"reward": 0.9359375238418579,
"reward_std": 0.17898640036582947,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20468750596046448,
"rewards/reward_coverage/std": 0.17129728198051453,
"rewards/reward_repetition/mean": 0.731249988079071,
"rewards/reward_repetition/std": 0.09574270248413086,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7362239956855774,
"sampling/importance_sampling_ratio/min": 2.602479520623457e-19,
"sampling/sampling_logp_difference/max": 42.79265213012695,
"sampling/sampling_logp_difference/mean": 3.2387518882751465,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 40.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.1951053044758737,
"epoch": 0.5698924731182796,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0901892185211182,
"learning_rate": 2.8082191780821922e-06,
"loss": -0.0096,
"num_tokens": 17846929.0,
"reward": 0.9390625357627869,
"reward_std": 0.19224466383457184,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.2109375,
"rewards/reward_coverage/std": 0.17193317413330078,
"rewards/reward_repetition/mean": 0.7281249761581421,
"rewards/reward_repetition/std": 0.10307764261960983,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7550839781761169,
"sampling/importance_sampling_ratio/min": 1.1955911409525077e-18,
"sampling/sampling_logp_difference/max": 41.26789093017578,
"sampling/sampling_logp_difference/mean": 3.1413958072662354,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.765625,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.21364939608611166,
"epoch": 0.5806451612903226,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.0280330181121826,
"learning_rate": 2.7397260273972604e-06,
"loss": -0.0093,
"num_tokens": 18237620.0,
"reward": 0.9500000476837158,
"reward_std": 0.1414213478565216,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19062501192092896,
"rewards/reward_coverage/std": 0.1540137678384781,
"rewards/reward_repetition/mean": 0.7593749761581421,
"rewards/reward_repetition/std": 0.0885845422744751,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.738981306552887,
"sampling/importance_sampling_ratio/min": 4.2308257591772147e-13,
"sampling/sampling_logp_difference/max": 28.491209030151367,
"sampling/sampling_logp_difference/mean": 3.156113624572754,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.890625,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.24034091946668923,
"epoch": 0.5913978494623656,
"frac_reward_zero_std": 0.1875,
"grad_norm": 1.18429696559906,
"learning_rate": 2.671232876712329e-06,
"loss": -0.0049,
"num_tokens": 18628129.0,
"reward": 0.8953125476837158,
"reward_std": 0.1657281517982483,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1796875,
"rewards/reward_coverage/std": 0.1299324631690979,
"rewards/reward_repetition/mean": 0.715624988079071,
"rewards/reward_repetition/std": 0.11158134788274765,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7346891760826111,
"sampling/importance_sampling_ratio/min": 1.511831024952892e-13,
"sampling/sampling_logp_difference/max": 29.52028465270996,
"sampling/sampling_logp_difference/mean": 3.2208282947540283,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 33.0,
"completions/mean_length": 39.84375,
"completions/mean_terminated_length": 33.0,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.23257427848875523,
"epoch": 0.6021505376344086,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.6966381072998047,
"learning_rate": 2.6027397260273973e-06,
"loss": -0.0088,
"num_tokens": 19018723.0,
"reward": 0.9343750476837158,
"reward_std": 0.15026018023490906,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19687500596046448,
"rewards/reward_coverage/std": 0.1284446120262146,
"rewards/reward_repetition/mean": 0.737500011920929,
"rewards/reward_repetition/std": 0.106159508228302,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7426254153251648,
"sampling/importance_sampling_ratio/min": 7.579855932368998e-14,
"sampling/sampling_logp_difference/max": 30.210697174072266,
"sampling/sampling_logp_difference/mean": 3.221386194229126,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2281794489827007,
"epoch": 0.6129032258064516,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7310183048248291,
"learning_rate": 2.534246575342466e-06,
"loss": -0.0083,
"num_tokens": 19409232.0,
"reward": 0.9671875238418579,
"reward_std": 0.14363107085227966,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.22968751192092896,
"rewards/reward_coverage/std": 0.14979319274425507,
"rewards/reward_repetition/mean": 0.737500011920929,
"rewards/reward_repetition/std": 0.10000000149011612,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7459241151809692,
"sampling/importance_sampling_ratio/min": 1.3522516009469616e-19,
"sampling/sampling_logp_difference/max": 43.44734573364258,
"sampling/sampling_logp_difference/mean": 3.226292133331299,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24348380486480892,
"epoch": 0.6236559139784946,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.9412605166435242,
"learning_rate": 2.4657534246575345e-06,
"loss": -0.0063,
"num_tokens": 19799925.0,
"reward": 0.948437511920929,
"reward_std": 0.15246990323066711,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.14637655019760132,
"rewards/reward_repetition/mean": 0.746874988079071,
"rewards/reward_repetition/std": 0.10833332687616348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7302582263946533,
"sampling/importance_sampling_ratio/min": 1.1974077329752507e-18,
"sampling/sampling_logp_difference/max": 41.26637268066406,
"sampling/sampling_logp_difference/mean": 3.3912689685821533,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.22413912834599614,
"epoch": 0.6344086021505376,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.6372097730636597,
"learning_rate": 2.3972602739726027e-06,
"loss": -0.0078,
"num_tokens": 20190710.0,
"reward": 0.953125,
"reward_std": 0.11048543453216553,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1875,
"rewards/reward_coverage/std": 0.1278640329837799,
"rewards/reward_repetition/mean": 0.765625,
"rewards/reward_repetition/std": 0.08398554474115372,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7377102971076965,
"sampling/importance_sampling_ratio/min": 5.862870793152246e-21,
"sampling/sampling_logp_difference/max": 46.58564758300781,
"sampling/sampling_logp_difference/mean": 3.426024913787842,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.23593324795365334,
"epoch": 0.6451612903225806,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.7991491556167603,
"learning_rate": 2.3287671232876713e-06,
"loss": -0.0066,
"num_tokens": 20581308.0,
"reward": 0.9515625238418579,
"reward_std": 0.13921165466308594,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18906250596046448,
"rewards/reward_coverage/std": 0.12487097084522247,
"rewards/reward_repetition/mean": 0.7625000476837158,
"rewards/reward_repetition/std": 0.11751393228769302,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7420527935028076,
"sampling/importance_sampling_ratio/min": 1.627108762957747e-23,
"sampling/sampling_logp_difference/max": 52.472652435302734,
"sampling/sampling_logp_difference/mean": 3.326465606689453,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.9375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.25969044235534966,
"epoch": 0.6559139784946236,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.9682433009147644,
"learning_rate": 2.26027397260274e-06,
"loss": -0.0109,
"num_tokens": 20972092.0,
"reward": 0.9828125238418579,
"reward_std": 0.15688931941986084,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20781250298023224,
"rewards/reward_coverage/std": 0.1336955726146698,
"rewards/reward_repetition/mean": 0.7749999761581421,
"rewards/reward_repetition/std": 0.09759000688791275,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7352744936943054,
"sampling/importance_sampling_ratio/min": 4.3607164320474956e-13,
"sampling/sampling_logp_difference/max": 28.460969924926758,
"sampling/sampling_logp_difference/mean": 3.418292999267578,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.23248756467364728,
"epoch": 0.6666666666666666,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.9272934198379517,
"learning_rate": 2.191780821917808e-06,
"loss": -0.0034,
"num_tokens": 21362873.0,
"reward": 0.9390624761581421,
"reward_std": 0.13921163976192474,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18593750894069672,
"rewards/reward_coverage/std": 0.11800886690616608,
"rewards/reward_repetition/mean": 0.7531249523162842,
"rewards/reward_repetition/std": 0.1053621917963028,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7381278276443481,
"sampling/importance_sampling_ratio/min": 5.011335584784865e-14,
"sampling/sampling_logp_difference/max": 30.624488830566406,
"sampling/sampling_logp_difference/mean": 3.4408388137817383,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.890625,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.2272115428932011,
"epoch": 0.6774193548387096,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.6223781704902649,
"learning_rate": 2.123287671232877e-06,
"loss": -0.0014,
"num_tokens": 21753644.0,
"reward": 0.9937499761581421,
"reward_std": 0.16793787479400635,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20625001192092896,
"rewards/reward_coverage/std": 0.15210169553756714,
"rewards/reward_repetition/mean": 0.7875000238418579,
"rewards/reward_repetition/std": 0.08637312799692154,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7413582801818848,
"sampling/importance_sampling_ratio/min": 8.111444921513807e-17,
"sampling/sampling_logp_difference/max": 37.0506706237793,
"sampling/sampling_logp_difference/mean": 3.607243776321411,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2311963385436684,
"epoch": 0.6881720430107527,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.8859496116638184,
"learning_rate": 2.0547945205479454e-06,
"loss": 0.0046,
"num_tokens": 22144251.0,
"reward": 1.0171875953674316,
"reward_std": 0.15688931941986084,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.2421875,
"rewards/reward_coverage/std": 0.16407963633537292,
"rewards/reward_repetition/mean": 0.7749999761581421,
"rewards/reward_repetition/std": 0.09085135161876678,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7465033531188965,
"sampling/importance_sampling_ratio/min": 4.7572778301925226e-18,
"sampling/sampling_logp_difference/max": 39.88685607910156,
"sampling/sampling_logp_difference/mean": 3.552140951156616,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2426956002600491,
"epoch": 0.6989247311827957,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.269731044769287,
"learning_rate": 1.9863013698630136e-06,
"loss": -0.0092,
"num_tokens": 22535042.0,
"reward": 0.9437500238418579,
"reward_std": 0.16793784499168396,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.16562500596046448,
"rewards/reward_coverage/std": 0.1382644772529602,
"rewards/reward_repetition/mean": 0.778124988079071,
"rewards/reward_repetition/std": 0.10759823769330978,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.745509684085846,
"sampling/importance_sampling_ratio/min": 2.5291580594867795e-16,
"sampling/sampling_logp_difference/max": 35.913475036621094,
"sampling/sampling_logp_difference/mean": 3.580662488937378,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 40.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.25972409872338176,
"epoch": 0.7096774193548387,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.2360777854919434,
"learning_rate": 1.9178082191780823e-06,
"loss": -0.0021,
"num_tokens": 22925826.0,
"reward": 0.989062488079071,
"reward_std": 0.13921163976192474,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.1578548550605774,
"rewards/reward_repetition/mean": 0.7875000238418579,
"rewards/reward_repetition/std": 0.07867958396673203,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.742576003074646,
"sampling/importance_sampling_ratio/min": 3.8009925881678924e-15,
"sampling/sampling_logp_difference/max": 33.203514099121094,
"sampling/sampling_logp_difference/mean": 3.5737202167510986,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.9375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24233098467811942,
"epoch": 0.7204301075268817,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.0064667463302612,
"learning_rate": 1.8493150684931507e-06,
"loss": -0.0074,
"num_tokens": 23316620.0,
"reward": 0.9796874523162842,
"reward_std": 0.16130872070789337,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18281251192092896,
"rewards/reward_coverage/std": 0.13280214369297028,
"rewards/reward_repetition/mean": 0.796875,
"rewards/reward_repetition/std": 0.11542708426713943,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7610887885093689,
"sampling/importance_sampling_ratio/min": 3.512002747491507e-17,
"sampling/sampling_logp_difference/max": 37.887760162353516,
"sampling/sampling_logp_difference/mean": 3.572136878967285,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.26319174305535853,
"epoch": 0.7311827956989247,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0560470819473267,
"learning_rate": 1.7808219178082193e-06,
"loss": -0.012,
"num_tokens": 23707121.0,
"reward": 0.9937500953674316,
"reward_std": 0.16351842880249023,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21562500298023224,
"rewards/reward_coverage/std": 0.13359349966049194,
"rewards/reward_repetition/mean": 0.778124988079071,
"rewards/reward_repetition/std": 0.12404395639896393,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7442783713340759,
"sampling/importance_sampling_ratio/min": 1.375699243920652e-15,
"sampling/sampling_logp_difference/max": 34.21981430053711,
"sampling/sampling_logp_difference/mean": 3.5864293575286865,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.78125,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.27109498833306134,
"epoch": 0.7419354838709677,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.1270716190338135,
"learning_rate": 1.7123287671232877e-06,
"loss": -0.0065,
"num_tokens": 24097617.0,
"reward": 0.9968750476837158,
"reward_std": 0.1237436830997467,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20937500894069672,
"rewards/reward_coverage/std": 0.1376892626285553,
"rewards/reward_repetition/mean": 0.7875000238418579,
"rewards/reward_repetition/std": 0.09343531727790833,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7473452687263489,
"sampling/importance_sampling_ratio/min": 1.4495503789956396e-16,
"sampling/sampling_logp_difference/max": 36.47010803222656,
"sampling/sampling_logp_difference/mean": 3.4904568195343018,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.26768106454983354,
"epoch": 0.7526881720430108,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.1696867942810059,
"learning_rate": 1.6438356164383561e-06,
"loss": -0.0057,
"num_tokens": 24488387.0,
"reward": 1.0328125953674316,
"reward_std": 0.13921163976192474,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.22031250596046448,
"rewards/reward_coverage/std": 0.13590343296527863,
"rewards/reward_repetition/mean": 0.8125,
"rewards/reward_repetition/std": 0.11198072135448456,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7499436736106873,
"sampling/importance_sampling_ratio/min": 1.4622693992618306e-15,
"sampling/sampling_logp_difference/max": 34.15878677368164,
"sampling/sampling_logp_difference/mean": 3.6728289127349854,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24969360628165305,
"epoch": 0.7634408602150538,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.8783805966377258,
"learning_rate": 1.5753424657534248e-06,
"loss": -0.0031,
"num_tokens": 24879084.0,
"reward": 1.0343749523162842,
"reward_std": 0.1590990126132965,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19062501192092896,
"rewards/reward_coverage/std": 0.13179922103881836,
"rewards/reward_repetition/mean": 0.8437500596046448,
"rewards/reward_repetition/std": 0.10965313017368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7588081955909729,
"sampling/importance_sampling_ratio/min": 3.0506860601055286e-14,
"sampling/sampling_logp_difference/max": 31.120824813842773,
"sampling/sampling_logp_difference/mean": 3.584549903869629,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.796875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2575332070700824,
"epoch": 0.7741935483870968,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.6952300071716309,
"learning_rate": 1.5068493150684932e-06,
"loss": -0.0028,
"num_tokens": 25269863.0,
"reward": 1.029687523841858,
"reward_std": 0.14363105595111847,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1953125,
"rewards/reward_coverage/std": 0.1396477371454239,
"rewards/reward_repetition/mean": 0.8343750238418579,
"rewards/reward_repetition/std": 0.11014961451292038,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.767298698425293,
"sampling/importance_sampling_ratio/min": 1.6540065300593926e-15,
"sampling/sampling_logp_difference/max": 34.03557586669922,
"sampling/sampling_logp_difference/mean": 3.5412185192108154,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.796875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.27034957450814545,
"epoch": 0.7849462365591398,
"frac_reward_zero_std": 0.28125,
"grad_norm": 1.3305058479309082,
"learning_rate": 1.4383561643835616e-06,
"loss": -0.0086,
"num_tokens": 25660624.0,
"reward": 0.9765625,
"reward_std": 0.1303728073835373,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19218750298023224,
"rewards/reward_coverage/std": 0.11724982410669327,
"rewards/reward_repetition/mean": 0.784375011920929,
"rewards/reward_repetition/std": 0.10269193351268768,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7455124258995056,
"sampling/importance_sampling_ratio/min": 4.9776697520764746e-14,
"sampling/sampling_logp_difference/max": 30.631229400634766,
"sampling/sampling_logp_difference/mean": 3.557706594467163,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 40.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.2672195213381201,
"epoch": 0.7956989247311828,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.368238925933838,
"learning_rate": 1.3698630136986302e-06,
"loss": -0.0091,
"num_tokens": 26051389.0,
"reward": 1.0171875953674316,
"reward_std": 0.15246988832950592,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1953125,
"rewards/reward_coverage/std": 0.1361951231956482,
"rewards/reward_repetition/mean": 0.8218749761581421,
"rewards/reward_repetition/std": 0.10759823024272919,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7550181746482849,
"sampling/importance_sampling_ratio/min": 4.837896576403988e-13,
"sampling/sampling_logp_difference/max": 28.357126235961914,
"sampling/sampling_logp_difference/mean": 3.518388271331787,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2512192933354527,
"epoch": 0.8064516129032258,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.8075931072235107,
"learning_rate": 1.3013698630136986e-06,
"loss": -0.0055,
"num_tokens": 26442164.0,
"reward": 0.9906250238418579,
"reward_std": 0.12816309928894043,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.16562500596046448,
"rewards/reward_coverage/std": 0.12626346945762634,
"rewards/reward_repetition/mean": 0.8250000476837158,
"rewards/reward_repetition/std": 0.09085134416818619,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7599254250526428,
"sampling/importance_sampling_ratio/min": 1.1269083539586222e-12,
"sampling/sampling_logp_difference/max": 27.51154327392578,
"sampling/sampling_logp_difference/mean": 3.5641071796417236,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.25094706076197326,
"epoch": 0.8172043010752689,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.8058338165283203,
"learning_rate": 1.2328767123287673e-06,
"loss": -0.008,
"num_tokens": 26832860.0,
"reward": 1.0093750953674316,
"reward_std": 0.1767766773700714,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19374999403953552,
"rewards/reward_coverage/std": 0.16122055053710938,
"rewards/reward_repetition/mean": 0.815625011920929,
"rewards/reward_repetition/std": 0.10269193351268768,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7578780055046082,
"sampling/importance_sampling_ratio/min": 1.1848823085411635e-15,
"sampling/sampling_logp_difference/max": 34.36913299560547,
"sampling/sampling_logp_difference/mean": 3.489642858505249,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.90625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.26383460965007544,
"epoch": 0.8279569892473119,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.9045315384864807,
"learning_rate": 1.1643835616438357e-06,
"loss": -0.0039,
"num_tokens": 27223636.0,
"reward": 1.032812476158142,
"reward_std": 0.14363105595111847,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.12407395988702774,
"rewards/reward_repetition/mean": 0.831250011920929,
"rewards/reward_repetition/std": 0.12456272542476654,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7593458890914917,
"sampling/importance_sampling_ratio/min": 1.2729455923859382e-14,
"sampling/sampling_logp_difference/max": 31.994857788085938,
"sampling/sampling_logp_difference/mean": 3.625725030899048,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.859375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2579868610482663,
"epoch": 0.8387096774193549,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.0903428792953491,
"learning_rate": 1.095890410958904e-06,
"loss": -0.0077,
"num_tokens": 27614409.0,
"reward": 1.053125023841858,
"reward_std": 0.18119609355926514,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.22187501192092896,
"rewards/reward_coverage/std": 0.15682236850261688,
"rewards/reward_repetition/mean": 0.8312499523162842,
"rewards/reward_repetition/std": 0.12456272542476654,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7564480304718018,
"sampling/importance_sampling_ratio/min": 3.942305127637401e-14,
"sampling/sampling_logp_difference/max": 30.864425659179688,
"sampling/sampling_logp_difference/mean": 3.604686975479126,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24791082250885665,
"epoch": 0.8494623655913979,
"frac_reward_zero_std": 0.09375,
"grad_norm": 1.090151071548462,
"learning_rate": 1.0273972602739727e-06,
"loss": -0.0028,
"num_tokens": 28005198.0,
"reward": 1.029687523841858,
"reward_std": 0.1480504870414734,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19218750298023224,
"rewards/reward_coverage/std": 0.16837665438652039,
"rewards/reward_repetition/mean": 0.8375000357627869,
"rewards/reward_repetition/std": 0.106159508228302,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.771141767501831,
"sampling/importance_sampling_ratio/min": 3.6129458528665753e-14,
"sampling/sampling_logp_difference/max": 30.95166778564453,
"sampling/sampling_logp_difference/mean": 3.4720849990844727,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.859375,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.25930464873090386,
"epoch": 0.8602150537634409,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.083723545074463,
"learning_rate": 9.589041095890411e-07,
"loss": -0.0051,
"num_tokens": 28395507.0,
"reward": 1.046875,
"reward_std": 0.15026018023490906,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.23125000298023224,
"rewards/reward_coverage/std": 0.14015299081802368,
"rewards/reward_repetition/mean": 0.815625011920929,
"rewards/reward_repetition/std": 0.152459979057312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7536813020706177,
"sampling/importance_sampling_ratio/min": 9.19962348487624e-14,
"sampling/sampling_logp_difference/max": 30.01702880859375,
"sampling/sampling_logp_difference/mean": 3.546005964279175,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 40.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.24061511480249465,
"epoch": 0.8709677419354839,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.8584389686584473,
"learning_rate": 8.904109589041097e-07,
"loss": -0.0066,
"num_tokens": 28786289.0,
"reward": 1.0609374046325684,
"reward_std": 0.1303727924823761,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.2109375,
"rewards/reward_coverage/std": 0.15130877494812012,
"rewards/reward_repetition/mean": 0.8500000238418579,
"rewards/reward_repetition/std": 0.10690449178218842,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7686938643455505,
"sampling/importance_sampling_ratio/min": 2.277888706651854e-13,
"sampling/sampling_logp_difference/max": 29.1103572845459,
"sampling/sampling_logp_difference/mean": 3.516740560531616,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 39.84375,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.23985581938177347,
"epoch": 0.8817204301075269,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.9327372312545776,
"learning_rate": 8.219178082191781e-07,
"loss": -0.0095,
"num_tokens": 29177061.0,
"reward": 1.0640625953674316,
"reward_std": 0.15688930451869965,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.21718749403953552,
"rewards/reward_coverage/std": 0.13634072244167328,
"rewards/reward_repetition/mean": 0.846875011920929,
"rewards/reward_repetition/std": 0.12210942804813385,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7691041827201843,
"sampling/importance_sampling_ratio/min": 9.13759844699269e-13,
"sampling/sampling_logp_difference/max": 27.721208572387695,
"sampling/sampling_logp_difference/mean": 3.6279892921447754,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 33.0,
"completions/mean_length": 39.609375,
"completions/mean_terminated_length": 33.0,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.2540010770317167,
"epoch": 0.8924731182795699,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7506739497184753,
"learning_rate": 7.534246575342466e-07,
"loss": -0.0126,
"num_tokens": 29567814.0,
"reward": 1.0328125953674316,
"reward_std": 0.1834058165550232,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156249403953552,
"rewards/reward_coverage/std": 0.13857802748680115,
"rewards/reward_repetition/mean": 0.831250011920929,
"rewards/reward_repetition/std": 0.12456272542476654,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7523236274719238,
"sampling/importance_sampling_ratio/min": 5.4132771128059115e-14,
"sampling/sampling_logp_difference/max": 30.54733657836914,
"sampling/sampling_logp_difference/mean": 3.681413173675537,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.875,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.25169974751770496,
"epoch": 0.9032258064516129,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.8115216493606567,
"learning_rate": 6.849315068493151e-07,
"loss": -0.0139,
"num_tokens": 29958486.0,
"reward": 1.0359375476837158,
"reward_std": 0.13479222357273102,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.18593750894069672,
"rewards/reward_coverage/std": 0.14014413952827454,
"rewards/reward_repetition/mean": 0.8500000238418579,
"rewards/reward_repetition/std": 0.10690449178218842,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.755419909954071,
"sampling/importance_sampling_ratio/min": 5.566194003652804e-14,
"sampling/sampling_logp_difference/max": 30.519479751586914,
"sampling/sampling_logp_difference/mean": 3.5999581813812256,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2387481287587434,
"epoch": 0.9139784946236559,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.7745798826217651,
"learning_rate": 6.164383561643836e-07,
"loss": -0.0112,
"num_tokens": 30349169.0,
"reward": 1.0515624284744263,
"reward_std": 0.1834058165550232,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156250894069672,
"rewards/reward_coverage/std": 0.15274441242218018,
"rewards/reward_repetition/mean": 0.8500000238418579,
"rewards/reward_repetition/std": 0.11818736046552658,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.76320481300354,
"sampling/importance_sampling_ratio/min": 1.7256138737983123e-13,
"sampling/sampling_logp_difference/max": 29.388023376464844,
"sampling/sampling_logp_difference/mean": 3.6501305103302,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.765625,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2610483162570745,
"epoch": 0.9247311827956989,
"frac_reward_zero_std": 0.21875,
"grad_norm": 1.2502846717834473,
"learning_rate": 5.47945205479452e-07,
"loss": -0.0119,
"num_tokens": 30739838.0,
"reward": 1.0515625476837158,
"reward_std": 0.12595339119434357,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20781250298023224,
"rewards/reward_coverage/std": 0.12885908782482147,
"rewards/reward_repetition/mean": 0.84375,
"rewards/reward_repetition/std": 0.11529809236526489,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7513002753257751,
"sampling/importance_sampling_ratio/min": 2.8611465139379705e-14,
"sampling/sampling_logp_difference/max": 31.184968948364258,
"sampling/sampling_logp_difference/mean": 3.6847875118255615,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.23906523222103715,
"epoch": 0.9354838709677419,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.7462826371192932,
"learning_rate": 4.794520547945206e-07,
"loss": -0.0061,
"num_tokens": 31130530.0,
"reward": 1.045312523841858,
"reward_std": 0.1303728073835373,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156250894069672,
"rewards/reward_coverage/std": 0.13391800224781036,
"rewards/reward_repetition/mean": 0.84375,
"rewards/reward_repetition/std": 0.10965313017368317,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7613617777824402,
"sampling/importance_sampling_ratio/min": 9.466830531296155e-13,
"sampling/sampling_logp_difference/max": 27.68581199645996,
"sampling/sampling_logp_difference/mean": 3.6363892555236816,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2408477501012385,
"epoch": 0.946236559139785,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0979928970336914,
"learning_rate": 4.1095890410958903e-07,
"loss": -0.0084,
"num_tokens": 31521313.0,
"reward": 1.0375001430511475,
"reward_std": 0.1590990126132965,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.19062499701976776,
"rewards/reward_coverage/std": 0.14662203192710876,
"rewards/reward_repetition/mean": 0.846875011920929,
"rewards/reward_repetition/std": 0.09915315359830856,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7597459554672241,
"sampling/importance_sampling_ratio/min": 1.093270764716825e-12,
"sampling/sampling_logp_difference/max": 27.541847229003906,
"sampling/sampling_logp_difference/mean": 3.6034116744995117,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 40.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 39.859375,
"completions/mean_terminated_length": 37.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.254688891582191,
"epoch": 0.956989247311828,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.8742080926895142,
"learning_rate": 3.4246575342465755e-07,
"loss": -0.0096,
"num_tokens": 31911900.0,
"reward": 1.0859375,
"reward_std": 0.183405801653862,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.2515625059604645,
"rewards/reward_coverage/std": 0.16522441804409027,
"rewards/reward_repetition/mean": 0.8343749642372131,
"rewards/reward_repetition/std": 0.13119566440582275,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7481080889701843,
"sampling/importance_sampling_ratio/min": 1.0778268598939867e-16,
"sampling/sampling_logp_difference/max": 36.766414642333984,
"sampling/sampling_logp_difference/mean": 3.735830307006836,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24119682773016393,
"epoch": 0.967741935483871,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.9537080526351929,
"learning_rate": 2.73972602739726e-07,
"loss": -0.0102,
"num_tokens": 32302677.0,
"reward": 1.078125,
"reward_std": 0.15026018023490906,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.22812499105930328,
"rewards/reward_coverage/std": 0.15783129632472992,
"rewards/reward_repetition/mean": 0.8500000238418579,
"rewards/reward_repetition/std": 0.11268723756074905,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7631887793540955,
"sampling/importance_sampling_ratio/min": 4.8462983300891216e-14,
"sampling/sampling_logp_difference/max": 30.657976150512695,
"sampling/sampling_logp_difference/mean": 3.6851108074188232,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.24475648440420628,
"epoch": 0.978494623655914,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.890794038772583,
"learning_rate": 2.0547945205479452e-07,
"loss": -0.004,
"num_tokens": 32693369.0,
"reward": 1.0171875953674316,
"reward_std": 0.14363105595111847,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.17343750596046448,
"rewards/reward_coverage/std": 0.13362134993076324,
"rewards/reward_repetition/mean": 0.84375,
"rewards/reward_repetition/std": 0.11529809236526489,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7631818652153015,
"sampling/importance_sampling_ratio/min": 3.372482685910089e-14,
"sampling/sampling_logp_difference/max": 31.02054214477539,
"sampling/sampling_logp_difference/mean": 3.65973162651062,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 40.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 40.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2510380311869085,
"epoch": 0.989247311827957,
"frac_reward_zero_std": 0.09375,
"grad_norm": 0.9964897632598877,
"learning_rate": 1.36986301369863e-07,
"loss": -0.0126,
"num_tokens": 33084143.0,
"reward": 1.0359375476837158,
"reward_std": 0.17898640036582947,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.1953125,
"rewards/reward_coverage/std": 0.1385064274072647,
"rewards/reward_repetition/mean": 0.8406250476837158,
"rewards/reward_repetition/std": 0.134186252951622,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.753570556640625,
"sampling/importance_sampling_ratio/min": 6.38058653628466e-15,
"sampling/sampling_logp_difference/max": 32.685516357421875,
"sampling/sampling_logp_difference/mean": 3.6861093044281006,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 39.953125,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 0.0,
"entropy": 0.2551991257350892,
"epoch": 1.0,
"frac_reward_zero_std": 0.15625,
"grad_norm": 1.1012099981307983,
"learning_rate": 6.84931506849315e-08,
"loss": -0.0126,
"num_tokens": 33474818.0,
"reward": 1.0421874523162842,
"reward_std": 0.15246988832950592,
"rewards/reward_correct/mean": 0.0,
"rewards/reward_correct/std": 0.0,
"rewards/reward_coverage/mean": 0.20156250894069672,
"rewards/reward_coverage/std": 0.1290898472070694,
"rewards/reward_repetition/mean": 0.8406250476837158,
"rewards/reward_repetition/std": 0.1293681114912033,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.7444443702697754,
"sampling/importance_sampling_ratio/min": 2.801043902031508e-13,
"sampling/sampling_logp_difference/max": 28.903614044189453,
"sampling/sampling_logp_difference/mean": 3.683718204498291,
"step": 93
}
],
"logging_steps": 1,
"max_steps": 93,
"num_input_tokens_seen": 33474818,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}