{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03205128205128205, "grad_norm": 17.482923590326937, "learning_rate": 1.25e-07, "logits/chosen": -2.0185546875, "logits/rejected": -1.881250023841858, "logps/chosen": -235.671875, "logps/rejected": -302.26873779296875, "loss": 0.38, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.7926757335662842, "rewards/margins": 3.30548095703125, "rewards/rejected": -1.5131103992462158, "step": 5 }, { "epoch": 0.0641025641025641, "grad_norm": 10.731624225914716, "learning_rate": 2.8125e-07, "logits/chosen": -2.0074219703674316, "logits/rejected": -1.779687523841858, "logps/chosen": -220.52969360351562, "logps/rejected": -415.6499938964844, "loss": 0.4128, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.885888695716858, "rewards/margins": 3.5706787109375, "rewards/rejected": -1.6814696788787842, "step": 10 }, { "epoch": 0.09615384615384616, "grad_norm": 12.984171783098084, "learning_rate": 4.375e-07, "logits/chosen": -2.097851514816284, "logits/rejected": -1.916015625, "logps/chosen": -212.4031219482422, "logps/rejected": -295.92657470703125, "loss": 0.4187, "rewards/accuracies": 0.78125, "rewards/chosen": 1.880639672279358, "rewards/margins": 3.52978515625, "rewards/rejected": -1.648584008216858, "step": 15 }, { "epoch": 0.1282051282051282, "grad_norm": 39.351112208545295, "learning_rate": 4.949324324324325e-07, "logits/chosen": -2.023632764816284, "logits/rejected": -1.8396484851837158, "logps/chosen": -228.0656280517578, "logps/rejected": -375.046875, "loss": 0.5019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.77685546875, "rewards/margins": 3.207226514816284, "rewards/rejected": -1.429071068763733, "step": 20 }, { "epoch": 0.16025641025641027, "grad_norm": 8.14663574686874, "learning_rate": 4.864864864864865e-07, "logits/chosen": -1.9912109375, "logits/rejected": -1.8039062023162842, "logps/chosen": -231.5593719482422, "logps/rejected": -515.8937377929688, "loss": 0.4089, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.800073266029358, "rewards/margins": 3.571337938308716, "rewards/rejected": -1.771520972251892, "step": 25 }, { "epoch": 0.19230769230769232, "grad_norm": 16.42433782374998, "learning_rate": 4.780405405405405e-07, "logits/chosen": -1.983789086341858, "logits/rejected": -1.7492187023162842, "logps/chosen": -250.9375, "logps/rejected": -518.8781127929688, "loss": 0.3187, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 1.736535668373108, "rewards/margins": 3.2618165016174316, "rewards/rejected": -1.525964379310608, "step": 30 }, { "epoch": 0.22435897435897437, "grad_norm": 16.266393432601383, "learning_rate": 4.695945945945946e-07, "logits/chosen": -2.0166015625, "logits/rejected": -1.91015625, "logps/chosen": -209.7781219482422, "logps/rejected": -289.6625061035156, "loss": 0.3953, "rewards/accuracies": 0.8125, "rewards/chosen": 2.050332546234131, "rewards/margins": 3.6033051013946533, "rewards/rejected": -1.550323486328125, "step": 35 }, { "epoch": 0.2564102564102564, "grad_norm": 25.97441760074817, "learning_rate": 4.611486486486486e-07, "logits/chosen": -1.963476538658142, "logits/rejected": -1.8292968273162842, "logps/chosen": -283.6156311035156, "logps/rejected": -328.79766845703125, "loss": 0.4126, "rewards/accuracies": 0.8125, "rewards/chosen": 
1.7740600109100342, "rewards/margins": 3.731884717941284, "rewards/rejected": -1.9563720226287842, "step": 40 }, { "epoch": 0.28846153846153844, "grad_norm": 34.9734844548034, "learning_rate": 4.5270270270270264e-07, "logits/chosen": -2.0054688453674316, "logits/rejected": -1.7921874523162842, "logps/chosen": -274.5687561035156, "logps/rejected": -334.1656188964844, "loss": 0.377, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.108358860015869, "rewards/margins": 4.065283298492432, "rewards/rejected": -1.953271508216858, "step": 45 }, { "epoch": 0.32051282051282054, "grad_norm": 11.913243260486984, "learning_rate": 4.442567567567567e-07, "logits/chosen": -2.0777344703674316, "logits/rejected": -1.947656273841858, "logps/chosen": -225.84219360351562, "logps/rejected": -250.46249389648438, "loss": 0.3373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.101611375808716, "rewards/margins": 3.5291504859924316, "rewards/rejected": -1.426367163658142, "step": 50 }, { "epoch": 0.3525641025641026, "grad_norm": 13.672519743198338, "learning_rate": 4.3581081081081076e-07, "logits/chosen": -2.114453077316284, "logits/rejected": -1.878320336341858, "logps/chosen": -331.421875, "logps/rejected": -381.27264404296875, "loss": 0.3941, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.777099609375, "rewards/margins": 3.0150146484375, "rewards/rejected": -1.236975073814392, "step": 55 }, { "epoch": 0.38461538461538464, "grad_norm": 11.373635937771688, "learning_rate": 4.2736486486486484e-07, "logits/chosen": -2.139843702316284, "logits/rejected": -1.938867211341858, "logps/chosen": -233.5578155517578, "logps/rejected": -377.8140563964844, "loss": 0.3037, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.1447510719299316, "rewards/margins": 3.566601514816284, "rewards/rejected": -1.420263648033142, "step": 60 }, { "epoch": 0.4166666666666667, "grad_norm": 14.605720279749862, "learning_rate": 4.189189189189189e-07, "logits/chosen": -1.915624976158142, "logits/rejected": -1.8369140625, "logps/chosen": -196.640625, "logps/rejected": -297.3812561035156, "loss": 0.3993, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.6602294445037842, "rewards/margins": 2.8565430641174316, "rewards/rejected": -1.1953613758087158, "step": 65 }, { "epoch": 0.44871794871794873, "grad_norm": 12.129690582617949, "learning_rate": 4.1047297297297296e-07, "logits/chosen": -2.043750047683716, "logits/rejected": -1.8582031726837158, "logps/chosen": -269.55780029296875, "logps/rejected": -349.8812561035156, "loss": 0.2719, "rewards/accuracies": 0.90625, "rewards/chosen": 2.2075562477111816, "rewards/margins": 3.8485350608825684, "rewards/rejected": -1.6388671398162842, "step": 70 }, { "epoch": 0.4807692307692308, "grad_norm": 10.894196057080642, "learning_rate": 4.02027027027027e-07, "logits/chosen": -2.0833983421325684, "logits/rejected": -1.8416016101837158, "logps/chosen": -218.09375, "logps/rejected": -379.48126220703125, "loss": 0.3121, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.30419921875, "rewards/margins": 3.564453125, "rewards/rejected": -1.260766625404358, "step": 75 }, { "epoch": 0.5128205128205128, "grad_norm": 10.28529818518839, "learning_rate": 3.935810810810811e-07, "logits/chosen": -2.027539014816284, "logits/rejected": -1.8759765625, "logps/chosen": -280.046875, "logps/rejected": -328.8125, "loss": 0.2987, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.126843214035034, "rewards/margins": 4.263671875, "rewards/rejected": 
-2.134082078933716, "step": 80 }, { "epoch": 0.5448717948717948, "grad_norm": 10.046587710863294, "learning_rate": 3.851351351351351e-07, "logits/chosen": -2.0047850608825684, "logits/rejected": -1.8369140625, "logps/chosen": -249.7734375, "logps/rejected": -267.5843811035156, "loss": 0.2905, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.1821961402893066, "rewards/margins": 3.7508788108825684, "rewards/rejected": -1.568945288658142, "step": 85 }, { "epoch": 0.5769230769230769, "grad_norm": 15.815810554242594, "learning_rate": 3.766891891891892e-07, "logits/chosen": -1.9933593273162842, "logits/rejected": -1.7882812023162842, "logps/chosen": -285.0328063964844, "logps/rejected": -244.2062530517578, "loss": 0.2709, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.258129835128784, "rewards/margins": 3.865039110183716, "rewards/rejected": -1.611413598060608, "step": 90 }, { "epoch": 0.6089743589743589, "grad_norm": 33.74436261855397, "learning_rate": 3.682432432432432e-07, "logits/chosen": -2.1357421875, "logits/rejected": -1.894140601158142, "logps/chosen": -245.30624389648438, "logps/rejected": -478.70001220703125, "loss": 0.2975, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.964135766029358, "rewards/margins": 3.7166991233825684, "rewards/rejected": -1.751123070716858, "step": 95 }, { "epoch": 0.6410256410256411, "grad_norm": 9.910094853594128, "learning_rate": 3.597972972972973e-07, "logits/chosen": -2.024609327316284, "logits/rejected": -1.865625023841858, "logps/chosen": -264.28436279296875, "logps/rejected": -300.07342529296875, "loss": 0.3171, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.304931640625, "rewards/margins": 3.8041014671325684, "rewards/rejected": -1.499169945716858, "step": 100 }, { "epoch": 0.6730769230769231, "grad_norm": 11.870080752446105, "learning_rate": 3.5135135135135134e-07, "logits/chosen": -2.0267577171325684, "logits/rejected": -1.8447265625, "logps/chosen": -259.40936279296875, "logps/rejected": -498.8890686035156, "loss": 0.2647, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.0042967796325684, "rewards/margins": 3.547900438308716, "rewards/rejected": -1.542810082435608, "step": 105 }, { "epoch": 0.7051282051282052, "grad_norm": 8.196509121053467, "learning_rate": 3.429054054054054e-07, "logits/chosen": -2.0044922828674316, "logits/rejected": -1.820703148841858, "logps/chosen": -221.078125, "logps/rejected": -435.2406311035156, "loss": 0.3216, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.1091065406799316, "rewards/margins": 3.4317383766174316, "rewards/rejected": -1.322973608970642, "step": 110 }, { "epoch": 0.7371794871794872, "grad_norm": 17.275173620118444, "learning_rate": 3.3445945945945946e-07, "logits/chosen": -2.083203077316284, "logits/rejected": -1.875585913658142, "logps/chosen": -243.6531219482422, "logps/rejected": -477.7124938964844, "loss": 0.2858, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.115771532058716, "rewards/margins": 3.8758788108825684, "rewards/rejected": -1.757867455482483, "step": 115 }, { "epoch": 0.7692307692307693, "grad_norm": 6.190361827721572, "learning_rate": 3.260135135135135e-07, "logits/chosen": -2.025390625, "logits/rejected": -1.809960961341858, "logps/chosen": -276.1343688964844, "logps/rejected": -320.890625, "loss": 0.2334, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.201000928878784, "rewards/margins": 4.690966606140137, "rewards/rejected": -2.4933104515075684, "step": 120 }, { "epoch": 
0.8012820512820513, "grad_norm": 17.015787309272795, "learning_rate": 3.175675675675675e-07, "logits/chosen": -1.984960913658142, "logits/rejected": -1.833593726158142, "logps/chosen": -248.43905639648438, "logps/rejected": -294.2093811035156, "loss": 0.3588, "rewards/accuracies": 0.875, "rewards/chosen": 2.134960889816284, "rewards/margins": 3.898571729660034, "rewards/rejected": -1.761315941810608, "step": 125 }, { "epoch": 0.8333333333333334, "grad_norm": 9.916096150406192, "learning_rate": 3.091216216216216e-07, "logits/chosen": -2.0804686546325684, "logits/rejected": -1.899999976158142, "logps/chosen": -237.94686889648438, "logps/rejected": -357.84063720703125, "loss": 0.2721, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.0490965843200684, "rewards/margins": 3.5159668922424316, "rewards/rejected": -1.4671142101287842, "step": 130 }, { "epoch": 0.8653846153846154, "grad_norm": 19.43204229304952, "learning_rate": 3.0067567567567564e-07, "logits/chosen": -1.9519531726837158, "logits/rejected": -1.7833983898162842, "logps/chosen": -268.28436279296875, "logps/rejected": -376.12188720703125, "loss": 0.2836, "rewards/accuracies": 0.84375, "rewards/chosen": 2.1514039039611816, "rewards/margins": 4.179858207702637, "rewards/rejected": -2.028857469558716, "step": 135 }, { "epoch": 0.8974358974358975, "grad_norm": 14.35891063544634, "learning_rate": 2.922297297297297e-07, "logits/chosen": -2.075976610183716, "logits/rejected": -1.883203148841858, "logps/chosen": -203.46875, "logps/rejected": -299.484375, "loss": 0.2292, "rewards/accuracies": 0.90625, "rewards/chosen": 2.165087938308716, "rewards/margins": 3.933666944503784, "rewards/rejected": -1.7722899913787842, "step": 140 }, { "epoch": 0.9294871794871795, "grad_norm": 15.571301633489812, "learning_rate": 2.8378378378378376e-07, "logits/chosen": -2.001757860183716, "logits/rejected": -1.8171875476837158, "logps/chosen": -257.5218811035156, "logps/rejected": -395.0625, "loss": 0.2054, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.296875, "rewards/margins": 4.208886623382568, "rewards/rejected": -1.913354516029358, "step": 145 }, { "epoch": 0.9615384615384616, "grad_norm": 10.241466724079272, "learning_rate": 2.7533783783783784e-07, "logits/chosen": -2.010937452316284, "logits/rejected": -1.7804687023162842, "logps/chosen": -263.1890563964844, "logps/rejected": -553.5797119140625, "loss": 0.2288, "rewards/accuracies": 0.9375, "rewards/chosen": 2.049511671066284, "rewards/margins": 4.341113090515137, "rewards/rejected": -2.291332960128784, "step": 150 }, { "epoch": 0.9935897435897436, "grad_norm": 8.858764442710157, "learning_rate": 2.6689189189189187e-07, "logits/chosen": -2.089062452316284, "logits/rejected": -1.937890648841858, "logps/chosen": -220.54843139648438, "logps/rejected": -315.1640625, "loss": 0.2811, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.0038084983825684, "rewards/margins": 3.595703125, "rewards/rejected": -1.5892822742462158, "step": 155 }, { "epoch": 1.0256410256410255, "grad_norm": 9.684185372221096, "learning_rate": 2.5844594594594596e-07, "logits/chosen": -2.1142578125, "logits/rejected": -1.8634765148162842, "logps/chosen": -242.419921875, "logps/rejected": -744.0179443359375, "loss": 0.2283, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.2816405296325684, "rewards/margins": 4.6513671875, "rewards/rejected": -2.3667969703674316, "step": 160 }, { "epoch": 1.0576923076923077, "grad_norm": 16.39607167381079, "learning_rate": 2.5e-07, "logits/chosen": 
-2.0179686546325684, "logits/rejected": -1.838281273841858, "logps/chosen": -285.15155029296875, "logps/rejected": -555.5343627929688, "loss": 0.2506, "rewards/accuracies": 0.90625, "rewards/chosen": 2.1651368141174316, "rewards/margins": 3.9991211891174316, "rewards/rejected": -1.832617163658142, "step": 165 }, { "epoch": 1.0897435897435896, "grad_norm": 12.71417077582037, "learning_rate": 2.41554054054054e-07, "logits/chosen": -1.9617187976837158, "logits/rejected": -1.7705078125, "logps/chosen": -229.18124389648438, "logps/rejected": -388.0687561035156, "loss": 0.2361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.22900390625, "rewards/margins": 4.506933689117432, "rewards/rejected": -2.277844190597534, "step": 170 }, { "epoch": 1.1217948717948718, "grad_norm": 12.537778856529926, "learning_rate": 2.331081081081081e-07, "logits/chosen": -2.107226610183716, "logits/rejected": -1.90234375, "logps/chosen": -215.9656219482422, "logps/rejected": -324.7749938964844, "loss": 0.2086, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.130053758621216, "rewards/margins": 4.331640720367432, "rewards/rejected": -2.203198194503784, "step": 175 }, { "epoch": 1.1538461538461537, "grad_norm": 13.46928418539436, "learning_rate": 2.2466216216216216e-07, "logits/chosen": -1.991601586341858, "logits/rejected": -1.790429711341858, "logps/chosen": -239.49063110351562, "logps/rejected": -397.62811279296875, "loss": 0.2422, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.213427782058716, "rewards/margins": 4.355273246765137, "rewards/rejected": -2.143115282058716, "step": 180 }, { "epoch": 1.185897435897436, "grad_norm": 16.721244760339083, "learning_rate": 2.1621621621621622e-07, "logits/chosen": -2.0091795921325684, "logits/rejected": -1.8250000476837158, "logps/chosen": -260.0874938964844, "logps/rejected": -433.359375, "loss": 0.3083, "rewards/accuracies": 0.875, "rewards/chosen": 2.1787109375, "rewards/margins": 3.8218750953674316, "rewards/rejected": -1.6440918445587158, "step": 185 }, { "epoch": 1.217948717948718, "grad_norm": 6.4545177104485845, "learning_rate": 2.0777027027027025e-07, "logits/chosen": -2.0591797828674316, "logits/rejected": -1.8468749523162842, "logps/chosen": -235.1687469482422, "logps/rejected": -273.58123779296875, "loss": 0.2293, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 2.180835008621216, "rewards/margins": 3.9361329078674316, "rewards/rejected": -1.753662109375, "step": 190 }, { "epoch": 1.25, "grad_norm": 13.390863695555577, "learning_rate": 1.993243243243243e-07, "logits/chosen": -2.015429735183716, "logits/rejected": -1.857812523841858, "logps/chosen": -247.94686889648438, "logps/rejected": -330.03436279296875, "loss": 0.2265, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.378588914871216, "rewards/margins": 4.5875244140625, "rewards/rejected": -2.206225633621216, "step": 195 }, { "epoch": 1.282051282051282, "grad_norm": 7.467973287614187, "learning_rate": 1.9087837837837837e-07, "logits/chosen": -1.9660155773162842, "logits/rejected": -1.7736327648162842, "logps/chosen": -242.78125, "logps/rejected": -278.73126220703125, "loss": 0.3117, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.247753858566284, "rewards/margins": 3.825390577316284, "rewards/rejected": -1.5797607898712158, "step": 200 }, { "epoch": 1.314102564102564, "grad_norm": 17.275598489252985, "learning_rate": 1.8243243243243243e-07, "logits/chosen": -2.0658202171325684, "logits/rejected": -1.8517577648162842, "logps/chosen": 
-226.86563110351562, "logps/rejected": -353.4937438964844, "loss": 0.254, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.29681396484375, "rewards/margins": 4.391015529632568, "rewards/rejected": -2.097705125808716, "step": 205 }, { "epoch": 1.3461538461538463, "grad_norm": 10.44378100815255, "learning_rate": 1.739864864864865e-07, "logits/chosen": -2.025390625, "logits/rejected": -1.800390601158142, "logps/chosen": -193.6984405517578, "logps/rejected": -289.08282470703125, "loss": 0.2395, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.071337938308716, "rewards/margins": 3.8559813499450684, "rewards/rejected": -1.7841675281524658, "step": 210 }, { "epoch": 1.3782051282051282, "grad_norm": 97.68377543207546, "learning_rate": 1.6554054054054055e-07, "logits/chosen": -1.9826171398162842, "logits/rejected": -1.755859375, "logps/chosen": -355.3062438964844, "logps/rejected": -485.71563720703125, "loss": 0.2301, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.012890577316284, "rewards/margins": 4.411523342132568, "rewards/rejected": -2.397656202316284, "step": 215 }, { "epoch": 1.4102564102564101, "grad_norm": 10.308990703109377, "learning_rate": 1.570945945945946e-07, "logits/chosen": -2.0074219703674316, "logits/rejected": -1.796484351158142, "logps/chosen": -291.6031188964844, "logps/rejected": -320.95001220703125, "loss": 0.2796, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 1.8628418445587158, "rewards/margins": 3.919921875, "rewards/rejected": -2.0587158203125, "step": 220 }, { "epoch": 1.4423076923076923, "grad_norm": 7.8418596148572455, "learning_rate": 1.4864864864864866e-07, "logits/chosen": -2.051953077316284, "logits/rejected": -1.8517577648162842, "logps/chosen": -221.51171875, "logps/rejected": -234.59375, "loss": 0.2821, "rewards/accuracies": 0.84375, "rewards/chosen": 1.9519531726837158, "rewards/margins": 3.46875, "rewards/rejected": -1.521032691001892, "step": 225 }, { "epoch": 1.4743589743589745, "grad_norm": 19.99054898354689, "learning_rate": 1.402027027027027e-07, "logits/chosen": -1.9873046875, "logits/rejected": -1.816015601158142, "logps/chosen": -234.86874389648438, "logps/rejected": -471.40625, "loss": 0.2533, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.637182593345642, "rewards/margins": 3.6805663108825684, "rewards/rejected": -2.044677734375, "step": 230 }, { "epoch": 1.5064102564102564, "grad_norm": 6.927398842550873, "learning_rate": 1.3175675675675673e-07, "logits/chosen": -2.116406202316284, "logits/rejected": -1.890234351158142, "logps/chosen": -236.1374969482422, "logps/rejected": -326.1812438964844, "loss": 0.2292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.9124755859375, "rewards/margins": 3.916796922683716, "rewards/rejected": -2.0068116188049316, "step": 235 }, { "epoch": 1.5384615384615383, "grad_norm": 19.26565095128112, "learning_rate": 1.233108108108108e-07, "logits/chosen": -1.9998047351837158, "logits/rejected": -1.8044922351837158, "logps/chosen": -215.8718719482422, "logps/rejected": -273.55780029296875, "loss": 0.2959, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.870080590248108, "rewards/margins": 3.464648485183716, "rewards/rejected": -1.59326171875, "step": 240 }, { "epoch": 1.5705128205128205, "grad_norm": 23.53473342051176, "learning_rate": 1.1486486486486487e-07, "logits/chosen": -2.0589842796325684, "logits/rejected": -1.871484398841858, "logps/chosen": -288.2890625, "logps/rejected": -367.5843811035156, "loss": 0.2964, 
"rewards/accuracies": 0.875, "rewards/chosen": 1.398657202720642, "rewards/margins": 3.667285203933716, "rewards/rejected": -2.267504930496216, "step": 245 }, { "epoch": 1.6025641025641026, "grad_norm": 9.487545014290102, "learning_rate": 1.0641891891891891e-07, "logits/chosen": -2.122851610183716, "logits/rejected": -1.9580078125, "logps/chosen": -285.9078063964844, "logps/rejected": -379.8843688964844, "loss": 0.3032, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.5487792491912842, "rewards/margins": 3.930615186691284, "rewards/rejected": -2.3807129859924316, "step": 250 }, { "epoch": 1.6346153846153846, "grad_norm": 34.335674181311, "learning_rate": 9.797297297297297e-08, "logits/chosen": -1.9865233898162842, "logits/rejected": -1.8146483898162842, "logps/chosen": -303.875, "logps/rejected": -383.5625, "loss": 0.2268, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.1684327125549316, "rewards/margins": 4.968652248382568, "rewards/rejected": -2.7986207008361816, "step": 255 }, { "epoch": 1.6666666666666665, "grad_norm": 12.79380674096048, "learning_rate": 8.952702702702702e-08, "logits/chosen": -1.9738280773162842, "logits/rejected": -1.810156226158142, "logps/chosen": -224.6687469482422, "logps/rejected": -420.2437438964844, "loss": 0.2571, "rewards/accuracies": 0.9375, "rewards/chosen": 1.805883765220642, "rewards/margins": 3.4756011962890625, "rewards/rejected": -1.672705054283142, "step": 260 }, { "epoch": 1.6987179487179487, "grad_norm": 5.288061215805156, "learning_rate": 8.108108108108108e-08, "logits/chosen": -2.00390625, "logits/rejected": -1.875585913658142, "logps/chosen": -238.39688110351562, "logps/rejected": -330.8125, "loss": 0.198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.1185302734375, "rewards/margins": 4.246289253234863, "rewards/rejected": -2.1275634765625, "step": 265 }, { "epoch": 1.7307692307692308, "grad_norm": 15.41954334878335, "learning_rate": 7.263513513513512e-08, "logits/chosen": -2.037890672683716, "logits/rejected": -1.8224608898162842, "logps/chosen": -284.765625, "logps/rejected": -533.375, "loss": 0.2335, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 1.4149596691131592, "rewards/margins": 4.398095607757568, "rewards/rejected": -2.9843382835388184, "step": 270 }, { "epoch": 1.7628205128205128, "grad_norm": 7.763713271113468, "learning_rate": 6.418918918918918e-08, "logits/chosen": -2.0380859375, "logits/rejected": -1.8134765625, "logps/chosen": -225.25, "logps/rejected": -430.89373779296875, "loss": 0.2135, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.960363745689392, "rewards/margins": 4.0166015625, "rewards/rejected": -2.0557618141174316, "step": 275 }, { "epoch": 1.7948717948717947, "grad_norm": 4.521033115765149, "learning_rate": 5.574324324324324e-08, "logits/chosen": -2.044140577316284, "logits/rejected": -1.811914086341858, "logps/chosen": -144.3640594482422, "logps/rejected": -276.8843688964844, "loss": 0.2559, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.030041456222534, "rewards/margins": 3.295703172683716, "rewards/rejected": -1.2666351795196533, "step": 280 }, { "epoch": 1.8269230769230769, "grad_norm": 9.52459220706363, "learning_rate": 4.72972972972973e-08, "logits/chosen": -2.127734422683716, "logits/rejected": -1.897070288658142, "logps/chosen": -280.30157470703125, "logps/rejected": -316.5171813964844, "loss": 0.2486, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.485009789466858, "rewards/margins": 3.428417921066284, 
"rewards/rejected": -1.9419434070587158, "step": 285 }, { "epoch": 1.858974358974359, "grad_norm": 8.906979076376345, "learning_rate": 3.885135135135135e-08, "logits/chosen": -2.0869140625, "logits/rejected": -1.8507812023162842, "logps/chosen": -217.7156219482422, "logps/rejected": -319.4203186035156, "loss": 0.2271, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.053515672683716, "rewards/margins": 4.0621337890625, "rewards/rejected": -2.010498046875, "step": 290 }, { "epoch": 1.891025641025641, "grad_norm": 9.745814847980592, "learning_rate": 3.040540540540541e-08, "logits/chosen": -2.052539110183716, "logits/rejected": -1.8537108898162842, "logps/chosen": -241.68905639648438, "logps/rejected": -401.51251220703125, "loss": 0.2465, "rewards/accuracies": 0.90625, "rewards/chosen": 1.9554443359375, "rewards/margins": 3.714062452316284, "rewards/rejected": -1.7587372064590454, "step": 295 }, { "epoch": 1.9230769230769231, "grad_norm": 10.279258989880054, "learning_rate": 2.195945945945946e-08, "logits/chosen": -1.91796875, "logits/rejected": -1.7571289539337158, "logps/chosen": -250.7375030517578, "logps/rejected": -312.2203063964844, "loss": 0.1891, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.004687547683716, "rewards/margins": 4.478320121765137, "rewards/rejected": -2.476269483566284, "step": 300 }, { "epoch": 1.9551282051282053, "grad_norm": 6.84223500138646, "learning_rate": 1.3513513513513514e-08, "logits/chosen": -2.0062499046325684, "logits/rejected": -1.889062523841858, "logps/chosen": -267.1031188964844, "logps/rejected": -407.0718688964844, "loss": 0.2554, "rewards/accuracies": 0.90625, "rewards/chosen": 1.6201751232147217, "rewards/margins": 3.678515672683716, "rewards/rejected": -2.054980516433716, "step": 305 }, { "epoch": 1.9871794871794872, "grad_norm": 6.084443681112896, "learning_rate": 5.067567567567567e-09, "logits/chosen": -2.08203125, "logits/rejected": -1.8772461414337158, "logps/chosen": -248.9656219482422, "logps/rejected": -302.21875, "loss": 0.2517, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.859375, "rewards/margins": 3.5653076171875, "rewards/rejected": -1.707067847251892, "step": 310 }, { "epoch": 2.0, "step": 312, "total_flos": 0.0, "train_loss": 0.2867361557407257, "train_runtime": 4280.6612, "train_samples_per_second": 2.331, "train_steps_per_second": 0.073 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }