Phi-4-Argunaut-1-SPIN-dev1 / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03205128205128205,
"grad_norm": 17.482923590326937,
"learning_rate": 1.25e-07,
"logits/chosen": -2.0185546875,
"logits/rejected": -1.881250023841858,
"logps/chosen": -235.671875,
"logps/rejected": -302.26873779296875,
"loss": 0.38,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 1.7926757335662842,
"rewards/margins": 3.30548095703125,
"rewards/rejected": -1.5131103992462158,
"step": 5
},
{
"epoch": 0.0641025641025641,
"grad_norm": 10.731624225914716,
"learning_rate": 2.8125e-07,
"logits/chosen": -2.0074219703674316,
"logits/rejected": -1.779687523841858,
"logps/chosen": -220.52969360351562,
"logps/rejected": -415.6499938964844,
"loss": 0.4128,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 1.885888695716858,
"rewards/margins": 3.5706787109375,
"rewards/rejected": -1.6814696788787842,
"step": 10
},
{
"epoch": 0.09615384615384616,
"grad_norm": 12.984171783098084,
"learning_rate": 4.375e-07,
"logits/chosen": -2.097851514816284,
"logits/rejected": -1.916015625,
"logps/chosen": -212.4031219482422,
"logps/rejected": -295.92657470703125,
"loss": 0.4187,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.880639672279358,
"rewards/margins": 3.52978515625,
"rewards/rejected": -1.648584008216858,
"step": 15
},
{
"epoch": 0.1282051282051282,
"grad_norm": 39.351112208545295,
"learning_rate": 4.949324324324325e-07,
"logits/chosen": -2.023632764816284,
"logits/rejected": -1.8396484851837158,
"logps/chosen": -228.0656280517578,
"logps/rejected": -375.046875,
"loss": 0.5019,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 1.77685546875,
"rewards/margins": 3.207226514816284,
"rewards/rejected": -1.429071068763733,
"step": 20
},
{
"epoch": 0.16025641025641027,
"grad_norm": 8.14663574686874,
"learning_rate": 4.864864864864865e-07,
"logits/chosen": -1.9912109375,
"logits/rejected": -1.8039062023162842,
"logps/chosen": -231.5593719482422,
"logps/rejected": -515.8937377929688,
"loss": 0.4089,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 1.800073266029358,
"rewards/margins": 3.571337938308716,
"rewards/rejected": -1.771520972251892,
"step": 25
},
{
"epoch": 0.19230769230769232,
"grad_norm": 16.42433782374998,
"learning_rate": 4.780405405405405e-07,
"logits/chosen": -1.983789086341858,
"logits/rejected": -1.7492187023162842,
"logps/chosen": -250.9375,
"logps/rejected": -518.8781127929688,
"loss": 0.3187,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 1.736535668373108,
"rewards/margins": 3.2618165016174316,
"rewards/rejected": -1.525964379310608,
"step": 30
},
{
"epoch": 0.22435897435897437,
"grad_norm": 16.266393432601383,
"learning_rate": 4.695945945945946e-07,
"logits/chosen": -2.0166015625,
"logits/rejected": -1.91015625,
"logps/chosen": -209.7781219482422,
"logps/rejected": -289.6625061035156,
"loss": 0.3953,
"rewards/accuracies": 0.8125,
"rewards/chosen": 2.050332546234131,
"rewards/margins": 3.6033051013946533,
"rewards/rejected": -1.550323486328125,
"step": 35
},
{
"epoch": 0.2564102564102564,
"grad_norm": 25.97441760074817,
"learning_rate": 4.611486486486486e-07,
"logits/chosen": -1.963476538658142,
"logits/rejected": -1.8292968273162842,
"logps/chosen": -283.6156311035156,
"logps/rejected": -328.79766845703125,
"loss": 0.4126,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.7740600109100342,
"rewards/margins": 3.731884717941284,
"rewards/rejected": -1.9563720226287842,
"step": 40
},
{
"epoch": 0.28846153846153844,
"grad_norm": 34.9734844548034,
"learning_rate": 4.5270270270270264e-07,
"logits/chosen": -2.0054688453674316,
"logits/rejected": -1.7921874523162842,
"logps/chosen": -274.5687561035156,
"logps/rejected": -334.1656188964844,
"loss": 0.377,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 2.108358860015869,
"rewards/margins": 4.065283298492432,
"rewards/rejected": -1.953271508216858,
"step": 45
},
{
"epoch": 0.32051282051282054,
"grad_norm": 11.913243260486984,
"learning_rate": 4.442567567567567e-07,
"logits/chosen": -2.0777344703674316,
"logits/rejected": -1.947656273841858,
"logps/chosen": -225.84219360351562,
"logps/rejected": -250.46249389648438,
"loss": 0.3373,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 2.101611375808716,
"rewards/margins": 3.5291504859924316,
"rewards/rejected": -1.426367163658142,
"step": 50
},
{
"epoch": 0.3525641025641026,
"grad_norm": 13.672519743198338,
"learning_rate": 4.3581081081081076e-07,
"logits/chosen": -2.114453077316284,
"logits/rejected": -1.878320336341858,
"logps/chosen": -331.421875,
"logps/rejected": -381.27264404296875,
"loss": 0.3941,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 1.777099609375,
"rewards/margins": 3.0150146484375,
"rewards/rejected": -1.236975073814392,
"step": 55
},
{
"epoch": 0.38461538461538464,
"grad_norm": 11.373635937771688,
"learning_rate": 4.2736486486486484e-07,
"logits/chosen": -2.139843702316284,
"logits/rejected": -1.938867211341858,
"logps/chosen": -233.5578155517578,
"logps/rejected": -377.8140563964844,
"loss": 0.3037,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 2.1447510719299316,
"rewards/margins": 3.566601514816284,
"rewards/rejected": -1.420263648033142,
"step": 60
},
{
"epoch": 0.4166666666666667,
"grad_norm": 14.605720279749862,
"learning_rate": 4.189189189189189e-07,
"logits/chosen": -1.915624976158142,
"logits/rejected": -1.8369140625,
"logps/chosen": -196.640625,
"logps/rejected": -297.3812561035156,
"loss": 0.3993,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 1.6602294445037842,
"rewards/margins": 2.8565430641174316,
"rewards/rejected": -1.1953613758087158,
"step": 65
},
{
"epoch": 0.44871794871794873,
"grad_norm": 12.129690582617949,
"learning_rate": 4.1047297297297296e-07,
"logits/chosen": -2.043750047683716,
"logits/rejected": -1.8582031726837158,
"logps/chosen": -269.55780029296875,
"logps/rejected": -349.8812561035156,
"loss": 0.2719,
"rewards/accuracies": 0.90625,
"rewards/chosen": 2.2075562477111816,
"rewards/margins": 3.8485350608825684,
"rewards/rejected": -1.6388671398162842,
"step": 70
},
{
"epoch": 0.4807692307692308,
"grad_norm": 10.894196057080642,
"learning_rate": 4.02027027027027e-07,
"logits/chosen": -2.0833983421325684,
"logits/rejected": -1.8416016101837158,
"logps/chosen": -218.09375,
"logps/rejected": -379.48126220703125,
"loss": 0.3121,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 2.30419921875,
"rewards/margins": 3.564453125,
"rewards/rejected": -1.260766625404358,
"step": 75
},
{
"epoch": 0.5128205128205128,
"grad_norm": 10.28529818518839,
"learning_rate": 3.935810810810811e-07,
"logits/chosen": -2.027539014816284,
"logits/rejected": -1.8759765625,
"logps/chosen": -280.046875,
"logps/rejected": -328.8125,
"loss": 0.2987,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 2.126843214035034,
"rewards/margins": 4.263671875,
"rewards/rejected": -2.134082078933716,
"step": 80
},
{
"epoch": 0.5448717948717948,
"grad_norm": 10.046587710863294,
"learning_rate": 3.851351351351351e-07,
"logits/chosen": -2.0047850608825684,
"logits/rejected": -1.8369140625,
"logps/chosen": -249.7734375,
"logps/rejected": -267.5843811035156,
"loss": 0.2905,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.1821961402893066,
"rewards/margins": 3.7508788108825684,
"rewards/rejected": -1.568945288658142,
"step": 85
},
{
"epoch": 0.5769230769230769,
"grad_norm": 15.815810554242594,
"learning_rate": 3.766891891891892e-07,
"logits/chosen": -1.9933593273162842,
"logits/rejected": -1.7882812023162842,
"logps/chosen": -285.0328063964844,
"logps/rejected": -244.2062530517578,
"loss": 0.2709,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 2.258129835128784,
"rewards/margins": 3.865039110183716,
"rewards/rejected": -1.611413598060608,
"step": 90
},
{
"epoch": 0.6089743589743589,
"grad_norm": 33.74436261855397,
"learning_rate": 3.682432432432432e-07,
"logits/chosen": -2.1357421875,
"logits/rejected": -1.894140601158142,
"logps/chosen": -245.30624389648438,
"logps/rejected": -478.70001220703125,
"loss": 0.2975,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 1.964135766029358,
"rewards/margins": 3.7166991233825684,
"rewards/rejected": -1.751123070716858,
"step": 95
},
{
"epoch": 0.6410256410256411,
"grad_norm": 9.910094853594128,
"learning_rate": 3.597972972972973e-07,
"logits/chosen": -2.024609327316284,
"logits/rejected": -1.865625023841858,
"logps/chosen": -264.28436279296875,
"logps/rejected": -300.07342529296875,
"loss": 0.3171,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.304931640625,
"rewards/margins": 3.8041014671325684,
"rewards/rejected": -1.499169945716858,
"step": 100
},
{
"epoch": 0.6730769230769231,
"grad_norm": 11.870080752446105,
"learning_rate": 3.5135135135135134e-07,
"logits/chosen": -2.0267577171325684,
"logits/rejected": -1.8447265625,
"logps/chosen": -259.40936279296875,
"logps/rejected": -498.8890686035156,
"loss": 0.2647,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.0042967796325684,
"rewards/margins": 3.547900438308716,
"rewards/rejected": -1.542810082435608,
"step": 105
},
{
"epoch": 0.7051282051282052,
"grad_norm": 8.196509121053467,
"learning_rate": 3.429054054054054e-07,
"logits/chosen": -2.0044922828674316,
"logits/rejected": -1.820703148841858,
"logps/chosen": -221.078125,
"logps/rejected": -435.2406311035156,
"loss": 0.3216,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.1091065406799316,
"rewards/margins": 3.4317383766174316,
"rewards/rejected": -1.322973608970642,
"step": 110
},
{
"epoch": 0.7371794871794872,
"grad_norm": 17.275173620118444,
"learning_rate": 3.3445945945945946e-07,
"logits/chosen": -2.083203077316284,
"logits/rejected": -1.875585913658142,
"logps/chosen": -243.6531219482422,
"logps/rejected": -477.7124938964844,
"loss": 0.2858,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.115771532058716,
"rewards/margins": 3.8758788108825684,
"rewards/rejected": -1.757867455482483,
"step": 115
},
{
"epoch": 0.7692307692307693,
"grad_norm": 6.190361827721572,
"learning_rate": 3.260135135135135e-07,
"logits/chosen": -2.025390625,
"logits/rejected": -1.809960961341858,
"logps/chosen": -276.1343688964844,
"logps/rejected": -320.890625,
"loss": 0.2334,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 2.201000928878784,
"rewards/margins": 4.690966606140137,
"rewards/rejected": -2.4933104515075684,
"step": 120
},
{
"epoch": 0.8012820512820513,
"grad_norm": 17.015787309272795,
"learning_rate": 3.175675675675675e-07,
"logits/chosen": -1.984960913658142,
"logits/rejected": -1.833593726158142,
"logps/chosen": -248.43905639648438,
"logps/rejected": -294.2093811035156,
"loss": 0.3588,
"rewards/accuracies": 0.875,
"rewards/chosen": 2.134960889816284,
"rewards/margins": 3.898571729660034,
"rewards/rejected": -1.761315941810608,
"step": 125
},
{
"epoch": 0.8333333333333334,
"grad_norm": 9.916096150406192,
"learning_rate": 3.091216216216216e-07,
"logits/chosen": -2.0804686546325684,
"logits/rejected": -1.899999976158142,
"logps/chosen": -237.94686889648438,
"logps/rejected": -357.84063720703125,
"loss": 0.2721,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 2.0490965843200684,
"rewards/margins": 3.5159668922424316,
"rewards/rejected": -1.4671142101287842,
"step": 130
},
{
"epoch": 0.8653846153846154,
"grad_norm": 19.43204229304952,
"learning_rate": 3.0067567567567564e-07,
"logits/chosen": -1.9519531726837158,
"logits/rejected": -1.7833983898162842,
"logps/chosen": -268.28436279296875,
"logps/rejected": -376.12188720703125,
"loss": 0.2836,
"rewards/accuracies": 0.84375,
"rewards/chosen": 2.1514039039611816,
"rewards/margins": 4.179858207702637,
"rewards/rejected": -2.028857469558716,
"step": 135
},
{
"epoch": 0.8974358974358975,
"grad_norm": 14.35891063544634,
"learning_rate": 2.922297297297297e-07,
"logits/chosen": -2.075976610183716,
"logits/rejected": -1.883203148841858,
"logps/chosen": -203.46875,
"logps/rejected": -299.484375,
"loss": 0.2292,
"rewards/accuracies": 0.90625,
"rewards/chosen": 2.165087938308716,
"rewards/margins": 3.933666944503784,
"rewards/rejected": -1.7722899913787842,
"step": 140
},
{
"epoch": 0.9294871794871795,
"grad_norm": 15.571301633489812,
"learning_rate": 2.8378378378378376e-07,
"logits/chosen": -2.001757860183716,
"logits/rejected": -1.8171875476837158,
"logps/chosen": -257.5218811035156,
"logps/rejected": -395.0625,
"loss": 0.2054,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 2.296875,
"rewards/margins": 4.208886623382568,
"rewards/rejected": -1.913354516029358,
"step": 145
},
{
"epoch": 0.9615384615384616,
"grad_norm": 10.241466724079272,
"learning_rate": 2.7533783783783784e-07,
"logits/chosen": -2.010937452316284,
"logits/rejected": -1.7804687023162842,
"logps/chosen": -263.1890563964844,
"logps/rejected": -553.5797119140625,
"loss": 0.2288,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.049511671066284,
"rewards/margins": 4.341113090515137,
"rewards/rejected": -2.291332960128784,
"step": 150
},
{
"epoch": 0.9935897435897436,
"grad_norm": 8.858764442710157,
"learning_rate": 2.6689189189189187e-07,
"logits/chosen": -2.089062452316284,
"logits/rejected": -1.937890648841858,
"logps/chosen": -220.54843139648438,
"logps/rejected": -315.1640625,
"loss": 0.2811,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 2.0038084983825684,
"rewards/margins": 3.595703125,
"rewards/rejected": -1.5892822742462158,
"step": 155
},
{
"epoch": 1.0256410256410255,
"grad_norm": 9.684185372221096,
"learning_rate": 2.5844594594594596e-07,
"logits/chosen": -2.1142578125,
"logits/rejected": -1.8634765148162842,
"logps/chosen": -242.419921875,
"logps/rejected": -744.0179443359375,
"loss": 0.2283,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 2.2816405296325684,
"rewards/margins": 4.6513671875,
"rewards/rejected": -2.3667969703674316,
"step": 160
},
{
"epoch": 1.0576923076923077,
"grad_norm": 16.39607167381079,
"learning_rate": 2.5e-07,
"logits/chosen": -2.0179686546325684,
"logits/rejected": -1.838281273841858,
"logps/chosen": -285.15155029296875,
"logps/rejected": -555.5343627929688,
"loss": 0.2506,
"rewards/accuracies": 0.90625,
"rewards/chosen": 2.1651368141174316,
"rewards/margins": 3.9991211891174316,
"rewards/rejected": -1.832617163658142,
"step": 165
},
{
"epoch": 1.0897435897435896,
"grad_norm": 12.71417077582037,
"learning_rate": 2.41554054054054e-07,
"logits/chosen": -1.9617187976837158,
"logits/rejected": -1.7705078125,
"logps/chosen": -229.18124389648438,
"logps/rejected": -388.0687561035156,
"loss": 0.2361,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 2.22900390625,
"rewards/margins": 4.506933689117432,
"rewards/rejected": -2.277844190597534,
"step": 170
},
{
"epoch": 1.1217948717948718,
"grad_norm": 12.537778856529926,
"learning_rate": 2.331081081081081e-07,
"logits/chosen": -2.107226610183716,
"logits/rejected": -1.90234375,
"logps/chosen": -215.9656219482422,
"logps/rejected": -324.7749938964844,
"loss": 0.2086,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 2.130053758621216,
"rewards/margins": 4.331640720367432,
"rewards/rejected": -2.203198194503784,
"step": 175
},
{
"epoch": 1.1538461538461537,
"grad_norm": 13.46928418539436,
"learning_rate": 2.2466216216216216e-07,
"logits/chosen": -1.991601586341858,
"logits/rejected": -1.790429711341858,
"logps/chosen": -239.49063110351562,
"logps/rejected": -397.62811279296875,
"loss": 0.2422,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 2.213427782058716,
"rewards/margins": 4.355273246765137,
"rewards/rejected": -2.143115282058716,
"step": 180
},
{
"epoch": 1.185897435897436,
"grad_norm": 16.721244760339083,
"learning_rate": 2.1621621621621622e-07,
"logits/chosen": -2.0091795921325684,
"logits/rejected": -1.8250000476837158,
"logps/chosen": -260.0874938964844,
"logps/rejected": -433.359375,
"loss": 0.3083,
"rewards/accuracies": 0.875,
"rewards/chosen": 2.1787109375,
"rewards/margins": 3.8218750953674316,
"rewards/rejected": -1.6440918445587158,
"step": 185
},
{
"epoch": 1.217948717948718,
"grad_norm": 6.4545177104485845,
"learning_rate": 2.0777027027027025e-07,
"logits/chosen": -2.0591797828674316,
"logits/rejected": -1.8468749523162842,
"logps/chosen": -235.1687469482422,
"logps/rejected": -273.58123779296875,
"loss": 0.2293,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 2.180835008621216,
"rewards/margins": 3.9361329078674316,
"rewards/rejected": -1.753662109375,
"step": 190
},
{
"epoch": 1.25,
"grad_norm": 13.390863695555577,
"learning_rate": 1.993243243243243e-07,
"logits/chosen": -2.015429735183716,
"logits/rejected": -1.857812523841858,
"logps/chosen": -247.94686889648438,
"logps/rejected": -330.03436279296875,
"loss": 0.2265,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 2.378588914871216,
"rewards/margins": 4.5875244140625,
"rewards/rejected": -2.206225633621216,
"step": 195
},
{
"epoch": 1.282051282051282,
"grad_norm": 7.467973287614187,
"learning_rate": 1.9087837837837837e-07,
"logits/chosen": -1.9660155773162842,
"logits/rejected": -1.7736327648162842,
"logps/chosen": -242.78125,
"logps/rejected": -278.73126220703125,
"loss": 0.3117,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.247753858566284,
"rewards/margins": 3.825390577316284,
"rewards/rejected": -1.5797607898712158,
"step": 200
},
{
"epoch": 1.314102564102564,
"grad_norm": 17.275598489252985,
"learning_rate": 1.8243243243243243e-07,
"logits/chosen": -2.0658202171325684,
"logits/rejected": -1.8517577648162842,
"logps/chosen": -226.86563110351562,
"logps/rejected": -353.4937438964844,
"loss": 0.254,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 2.29681396484375,
"rewards/margins": 4.391015529632568,
"rewards/rejected": -2.097705125808716,
"step": 205
},
{
"epoch": 1.3461538461538463,
"grad_norm": 10.44378100815255,
"learning_rate": 1.739864864864865e-07,
"logits/chosen": -2.025390625,
"logits/rejected": -1.800390601158142,
"logps/chosen": -193.6984405517578,
"logps/rejected": -289.08282470703125,
"loss": 0.2395,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.071337938308716,
"rewards/margins": 3.8559813499450684,
"rewards/rejected": -1.7841675281524658,
"step": 210
},
{
"epoch": 1.3782051282051282,
"grad_norm": 97.68377543207546,
"learning_rate": 1.6554054054054055e-07,
"logits/chosen": -1.9826171398162842,
"logits/rejected": -1.755859375,
"logps/chosen": -355.3062438964844,
"logps/rejected": -485.71563720703125,
"loss": 0.2301,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 2.012890577316284,
"rewards/margins": 4.411523342132568,
"rewards/rejected": -2.397656202316284,
"step": 215
},
{
"epoch": 1.4102564102564101,
"grad_norm": 10.308990703109377,
"learning_rate": 1.570945945945946e-07,
"logits/chosen": -2.0074219703674316,
"logits/rejected": -1.796484351158142,
"logps/chosen": -291.6031188964844,
"logps/rejected": -320.95001220703125,
"loss": 0.2796,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 1.8628418445587158,
"rewards/margins": 3.919921875,
"rewards/rejected": -2.0587158203125,
"step": 220
},
{
"epoch": 1.4423076923076923,
"grad_norm": 7.8418596148572455,
"learning_rate": 1.4864864864864866e-07,
"logits/chosen": -2.051953077316284,
"logits/rejected": -1.8517577648162842,
"logps/chosen": -221.51171875,
"logps/rejected": -234.59375,
"loss": 0.2821,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.9519531726837158,
"rewards/margins": 3.46875,
"rewards/rejected": -1.521032691001892,
"step": 225
},
{
"epoch": 1.4743589743589745,
"grad_norm": 19.99054898354689,
"learning_rate": 1.402027027027027e-07,
"logits/chosen": -1.9873046875,
"logits/rejected": -1.816015601158142,
"logps/chosen": -234.86874389648438,
"logps/rejected": -471.40625,
"loss": 0.2533,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.637182593345642,
"rewards/margins": 3.6805663108825684,
"rewards/rejected": -2.044677734375,
"step": 230
},
{
"epoch": 1.5064102564102564,
"grad_norm": 6.927398842550873,
"learning_rate": 1.3175675675675673e-07,
"logits/chosen": -2.116406202316284,
"logits/rejected": -1.890234351158142,
"logps/chosen": -236.1374969482422,
"logps/rejected": -326.1812438964844,
"loss": 0.2292,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.9124755859375,
"rewards/margins": 3.916796922683716,
"rewards/rejected": -2.0068116188049316,
"step": 235
},
{
"epoch": 1.5384615384615383,
"grad_norm": 19.26565095128112,
"learning_rate": 1.233108108108108e-07,
"logits/chosen": -1.9998047351837158,
"logits/rejected": -1.8044922351837158,
"logps/chosen": -215.8718719482422,
"logps/rejected": -273.55780029296875,
"loss": 0.2959,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.870080590248108,
"rewards/margins": 3.464648485183716,
"rewards/rejected": -1.59326171875,
"step": 240
},
{
"epoch": 1.5705128205128205,
"grad_norm": 23.53473342051176,
"learning_rate": 1.1486486486486487e-07,
"logits/chosen": -2.0589842796325684,
"logits/rejected": -1.871484398841858,
"logps/chosen": -288.2890625,
"logps/rejected": -367.5843811035156,
"loss": 0.2964,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.398657202720642,
"rewards/margins": 3.667285203933716,
"rewards/rejected": -2.267504930496216,
"step": 245
},
{
"epoch": 1.6025641025641026,
"grad_norm": 9.487545014290102,
"learning_rate": 1.0641891891891891e-07,
"logits/chosen": -2.122851610183716,
"logits/rejected": -1.9580078125,
"logps/chosen": -285.9078063964844,
"logps/rejected": -379.8843688964844,
"loss": 0.3032,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.5487792491912842,
"rewards/margins": 3.930615186691284,
"rewards/rejected": -2.3807129859924316,
"step": 250
},
{
"epoch": 1.6346153846153846,
"grad_norm": 34.335674181311,
"learning_rate": 9.797297297297297e-08,
"logits/chosen": -1.9865233898162842,
"logits/rejected": -1.8146483898162842,
"logps/chosen": -303.875,
"logps/rejected": -383.5625,
"loss": 0.2268,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 2.1684327125549316,
"rewards/margins": 4.968652248382568,
"rewards/rejected": -2.7986207008361816,
"step": 255
},
{
"epoch": 1.6666666666666665,
"grad_norm": 12.79380674096048,
"learning_rate": 8.952702702702702e-08,
"logits/chosen": -1.9738280773162842,
"logits/rejected": -1.810156226158142,
"logps/chosen": -224.6687469482422,
"logps/rejected": -420.2437438964844,
"loss": 0.2571,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.805883765220642,
"rewards/margins": 3.4756011962890625,
"rewards/rejected": -1.672705054283142,
"step": 260
},
{
"epoch": 1.6987179487179487,
"grad_norm": 5.288061215805156,
"learning_rate": 8.108108108108108e-08,
"logits/chosen": -2.00390625,
"logits/rejected": -1.875585913658142,
"logps/chosen": -238.39688110351562,
"logps/rejected": -330.8125,
"loss": 0.198,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 2.1185302734375,
"rewards/margins": 4.246289253234863,
"rewards/rejected": -2.1275634765625,
"step": 265
},
{
"epoch": 1.7307692307692308,
"grad_norm": 15.41954334878335,
"learning_rate": 7.263513513513512e-08,
"logits/chosen": -2.037890672683716,
"logits/rejected": -1.8224608898162842,
"logps/chosen": -284.765625,
"logps/rejected": -533.375,
"loss": 0.2335,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 1.4149596691131592,
"rewards/margins": 4.398095607757568,
"rewards/rejected": -2.9843382835388184,
"step": 270
},
{
"epoch": 1.7628205128205128,
"grad_norm": 7.763713271113468,
"learning_rate": 6.418918918918918e-08,
"logits/chosen": -2.0380859375,
"logits/rejected": -1.8134765625,
"logps/chosen": -225.25,
"logps/rejected": -430.89373779296875,
"loss": 0.2135,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.960363745689392,
"rewards/margins": 4.0166015625,
"rewards/rejected": -2.0557618141174316,
"step": 275
},
{
"epoch": 1.7948717948717947,
"grad_norm": 4.521033115765149,
"learning_rate": 5.574324324324324e-08,
"logits/chosen": -2.044140577316284,
"logits/rejected": -1.811914086341858,
"logps/chosen": -144.3640594482422,
"logps/rejected": -276.8843688964844,
"loss": 0.2559,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.030041456222534,
"rewards/margins": 3.295703172683716,
"rewards/rejected": -1.2666351795196533,
"step": 280
},
{
"epoch": 1.8269230769230769,
"grad_norm": 9.52459220706363,
"learning_rate": 4.72972972972973e-08,
"logits/chosen": -2.127734422683716,
"logits/rejected": -1.897070288658142,
"logps/chosen": -280.30157470703125,
"logps/rejected": -316.5171813964844,
"loss": 0.2486,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 1.485009789466858,
"rewards/margins": 3.428417921066284,
"rewards/rejected": -1.9419434070587158,
"step": 285
},
{
"epoch": 1.858974358974359,
"grad_norm": 8.906979076376345,
"learning_rate": 3.885135135135135e-08,
"logits/chosen": -2.0869140625,
"logits/rejected": -1.8507812023162842,
"logps/chosen": -217.7156219482422,
"logps/rejected": -319.4203186035156,
"loss": 0.2271,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.053515672683716,
"rewards/margins": 4.0621337890625,
"rewards/rejected": -2.010498046875,
"step": 290
},
{
"epoch": 1.891025641025641,
"grad_norm": 9.745814847980592,
"learning_rate": 3.040540540540541e-08,
"logits/chosen": -2.052539110183716,
"logits/rejected": -1.8537108898162842,
"logps/chosen": -241.68905639648438,
"logps/rejected": -401.51251220703125,
"loss": 0.2465,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.9554443359375,
"rewards/margins": 3.714062452316284,
"rewards/rejected": -1.7587372064590454,
"step": 295
},
{
"epoch": 1.9230769230769231,
"grad_norm": 10.279258989880054,
"learning_rate": 2.195945945945946e-08,
"logits/chosen": -1.91796875,
"logits/rejected": -1.7571289539337158,
"logps/chosen": -250.7375030517578,
"logps/rejected": -312.2203063964844,
"loss": 0.1891,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 2.004687547683716,
"rewards/margins": 4.478320121765137,
"rewards/rejected": -2.476269483566284,
"step": 300
},
{
"epoch": 1.9551282051282053,
"grad_norm": 6.84223500138646,
"learning_rate": 1.3513513513513514e-08,
"logits/chosen": -2.0062499046325684,
"logits/rejected": -1.889062523841858,
"logps/chosen": -267.1031188964844,
"logps/rejected": -407.0718688964844,
"loss": 0.2554,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.6201751232147217,
"rewards/margins": 3.678515672683716,
"rewards/rejected": -2.054980516433716,
"step": 305
},
{
"epoch": 1.9871794871794872,
"grad_norm": 6.084443681112896,
"learning_rate": 5.067567567567567e-09,
"logits/chosen": -2.08203125,
"logits/rejected": -1.8772461414337158,
"logps/chosen": -248.9656219482422,
"logps/rejected": -302.21875,
"loss": 0.2517,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 1.859375,
"rewards/margins": 3.5653076171875,
"rewards/rejected": -1.707067847251892,
"step": 310
},
{
"epoch": 2.0,
"step": 312,
"total_flos": 0.0,
"train_loss": 0.2867361557407257,
"train_runtime": 4280.6612,
"train_samples_per_second": 2.331,
"train_steps_per_second": 0.073
}
],
"logging_steps": 5,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
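Below is a minimal sketch, assuming only Python 3 and the standard library, of how the log above can be inspected: it reads `trainer_state.json` and prints the loss, reward margin, and reward accuracy recorded at each logging step. The file path is illustrative; the key names (`log_history`, `loss`, `rewards/margins`, `rewards/accuracies`) are taken directly from the file shown above.

```python
import json

# Load the trainer state dumped by the Trainer (path is illustrative).
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    # The final entry carries aggregate stats (train_loss, train_runtime)
    # rather than per-step metrics, so skip anything without a step loss.
    if "loss" not in entry:
        continue
    print(
        f"step {entry['step']:>3}  "
        f"epoch {entry['epoch']:.3f}  "
        f"loss {entry['loss']:.4f}  "
        f"margin {entry['rewards/margins']:.3f}  "
        f"acc {entry['rewards/accuracies']:.3f}"
    )
```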