diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8243 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3228955268859863, + "epoch": 0.0024390243902439024, + "grad_norm": 2.4187467098236084, + "learning_rate": 0.0, + "loss": 2.5739, + "mean_token_accuracy": 0.5249772071838379, + "num_tokens": 13188.0, + "step": 1 + }, + { + "entropy": 1.3434489965438843, + "epoch": 0.004878048780487805, + "grad_norm": 2.1801252365112305, + "learning_rate": 1.2195121951219514e-07, + "loss": 2.5104, + "mean_token_accuracy": 0.5338258147239685, + "num_tokens": 26877.0, + "step": 2 + }, + { + "entropy": 1.338788390159607, + "epoch": 0.007317073170731708, + "grad_norm": 2.2766218185424805, + "learning_rate": 2.439024390243903e-07, + "loss": 2.539, + "mean_token_accuracy": 0.5276630520820618, + "num_tokens": 40449.0, + "step": 3 + }, + { + "entropy": 1.3349497318267822, + "epoch": 0.00975609756097561, + "grad_norm": 2.353504180908203, + "learning_rate": 3.6585365853658536e-07, + "loss": 2.5631, + "mean_token_accuracy": 0.5275854468345642, + "num_tokens": 53606.0, + "step": 4 + }, + { + "entropy": 1.3273138999938965, + "epoch": 0.012195121951219513, + "grad_norm": 2.307615041732788, + "learning_rate": 4.878048780487805e-07, + "loss": 2.5412, + "mean_token_accuracy": 0.5292984843254089, + "num_tokens": 67121.0, + "step": 5 + }, + { + "entropy": 1.3538919687271118, + "epoch": 0.014634146341463415, + "grad_norm": 2.3148252964019775, + "learning_rate": 6.097560975609757e-07, + "loss": 2.5457, + "mean_token_accuracy": 0.5319257378578186, + "num_tokens": 80872.0, + "step": 6 + }, + { + "entropy": 1.3218836784362793, + "epoch": 0.01707317073170732, + "grad_norm": 2.2965075969696045, + "learning_rate": 7.317073170731707e-07, + "loss": 2.5325, + "mean_token_accuracy": 0.5365909337997437, + "num_tokens": 94088.0, + "step": 7 + }, + { + "entropy": 1.3360720872879028, + "epoch": 0.01951219512195122, + "grad_norm": 2.219707727432251, + "learning_rate": 8.53658536585366e-07, + "loss": 2.5082, + "mean_token_accuracy": 0.5307488441467285, + "num_tokens": 107698.0, + "step": 8 + }, + { + "entropy": 1.3242058753967285, + "epoch": 0.02195121951219512, + "grad_norm": 2.1611530780792236, + "learning_rate": 9.75609756097561e-07, + "loss": 2.4874, + "mean_token_accuracy": 0.5399101376533508, + "num_tokens": 121733.0, + "step": 9 + }, + { + "entropy": 1.3375442028045654, + "epoch": 0.024390243902439025, + "grad_norm": 2.1313178539276123, + "learning_rate": 1.0975609756097562e-06, + "loss": 2.4416, + "mean_token_accuracy": 0.5365526676177979, + "num_tokens": 135934.0, + "step": 10 + }, + { + "entropy": 1.3702865839004517, + "epoch": 0.026829268292682926, + "grad_norm": 2.212796688079834, + "learning_rate": 1.2195121951219514e-06, + "loss": 2.5007, + "mean_token_accuracy": 0.5316236615180969, + "num_tokens": 149832.0, + "step": 11 + }, + { + "entropy": 1.3510187864303589, + "epoch": 0.02926829268292683, + "grad_norm": 2.1613738536834717, + "learning_rate": 1.3414634146341465e-06, + "loss": 2.4974, + "mean_token_accuracy": 0.5329979062080383, + "num_tokens": 163364.0, + "step": 12 + }, + { + "entropy": 1.3398377895355225, + "epoch": 0.03170731707317073, + "grad_norm": 2.3386099338531494, + "learning_rate": 1.4634146341463414e-06, + "loss": 2.5261, + "mean_token_accuracy": 0.5286726355552673, + "num_tokens": 176511.0, + "step": 13 + }, + { + "entropy": 1.3582022190093994, + "epoch": 0.03414634146341464, + "grad_norm": 2.2629709243774414, + "learning_rate": 1.5853658536585368e-06, + "loss": 2.507, + "mean_token_accuracy": 0.5306152701377869, + "num_tokens": 190115.0, + "step": 14 + }, + { + "entropy": 1.3705586194992065, + "epoch": 0.036585365853658534, + "grad_norm": 2.2367234230041504, + "learning_rate": 1.707317073170732e-06, + "loss": 2.4833, + "mean_token_accuracy": 0.5337990522384644, + "num_tokens": 203874.0, + "step": 15 + }, + { + "entropy": 1.3703287839889526, + "epoch": 0.03902439024390244, + "grad_norm": 2.342644691467285, + "learning_rate": 1.8292682926829268e-06, + "loss": 2.5473, + "mean_token_accuracy": 0.5246161818504333, + "num_tokens": 217113.0, + "step": 16 + }, + { + "entropy": 1.3541935682296753, + "epoch": 0.041463414634146344, + "grad_norm": 2.2277114391326904, + "learning_rate": 1.951219512195122e-06, + "loss": 2.4667, + "mean_token_accuracy": 0.5292628407478333, + "num_tokens": 230952.0, + "step": 17 + }, + { + "entropy": 1.3334211111068726, + "epoch": 0.04390243902439024, + "grad_norm": 2.170741081237793, + "learning_rate": 2.073170731707317e-06, + "loss": 2.4062, + "mean_token_accuracy": 0.5387417078018188, + "num_tokens": 244971.0, + "step": 18 + }, + { + "entropy": 1.338529109954834, + "epoch": 0.046341463414634146, + "grad_norm": 2.3073835372924805, + "learning_rate": 2.1951219512195125e-06, + "loss": 2.456, + "mean_token_accuracy": 0.5352984666824341, + "num_tokens": 258557.0, + "step": 19 + }, + { + "entropy": 1.347283124923706, + "epoch": 0.04878048780487805, + "grad_norm": 2.3032565116882324, + "learning_rate": 2.317073170731708e-06, + "loss": 2.4649, + "mean_token_accuracy": 0.5279008150100708, + "num_tokens": 272121.0, + "step": 20 + }, + { + "entropy": 1.3316829204559326, + "epoch": 0.05121951219512195, + "grad_norm": 2.3000826835632324, + "learning_rate": 2.4390243902439027e-06, + "loss": 2.4087, + "mean_token_accuracy": 0.5390345454216003, + "num_tokens": 285830.0, + "step": 21 + }, + { + "entropy": 1.329493761062622, + "epoch": 0.05365853658536585, + "grad_norm": 2.2780513763427734, + "learning_rate": 2.5609756097560977e-06, + "loss": 2.4079, + "mean_token_accuracy": 0.54287189245224, + "num_tokens": 299398.0, + "step": 22 + }, + { + "entropy": 1.3588396310806274, + "epoch": 0.05609756097560976, + "grad_norm": 2.227916955947876, + "learning_rate": 2.682926829268293e-06, + "loss": 2.3605, + "mean_token_accuracy": 0.5377548933029175, + "num_tokens": 313783.0, + "step": 23 + }, + { + "entropy": 1.3629426956176758, + "epoch": 0.05853658536585366, + "grad_norm": 2.3609344959259033, + "learning_rate": 2.8048780487804884e-06, + "loss": 2.3981, + "mean_token_accuracy": 0.53337162733078, + "num_tokens": 327718.0, + "step": 24 + }, + { + "entropy": 1.3671380281448364, + "epoch": 0.06097560975609756, + "grad_norm": 2.4124526977539062, + "learning_rate": 2.926829268292683e-06, + "loss": 2.3878, + "mean_token_accuracy": 0.5356799364089966, + "num_tokens": 341257.0, + "step": 25 + }, + { + "entropy": 1.3848786354064941, + "epoch": 0.06341463414634146, + "grad_norm": 2.4480044841766357, + "learning_rate": 3.0487804878048782e-06, + "loss": 2.3921, + "mean_token_accuracy": 0.5313789248466492, + "num_tokens": 355088.0, + "step": 26 + }, + { + "entropy": 1.344458818435669, + "epoch": 0.06585365853658537, + "grad_norm": 2.3848752975463867, + "learning_rate": 3.1707317073170736e-06, + "loss": 2.3555, + "mean_token_accuracy": 0.5381656885147095, + "num_tokens": 368624.0, + "step": 27 + }, + { + "entropy": 1.350754976272583, + "epoch": 0.06829268292682927, + "grad_norm": 2.3436882495880127, + "learning_rate": 3.292682926829269e-06, + "loss": 2.3379, + "mean_token_accuracy": 0.5364958047866821, + "num_tokens": 382258.0, + "step": 28 + }, + { + "entropy": 1.370115876197815, + "epoch": 0.07073170731707316, + "grad_norm": 2.510852336883545, + "learning_rate": 3.414634146341464e-06, + "loss": 2.3477, + "mean_token_accuracy": 0.5365764498710632, + "num_tokens": 395944.0, + "step": 29 + }, + { + "entropy": 1.3533802032470703, + "epoch": 0.07317073170731707, + "grad_norm": 2.3963871002197266, + "learning_rate": 3.5365853658536588e-06, + "loss": 2.2746, + "mean_token_accuracy": 0.5382014513015747, + "num_tokens": 409860.0, + "step": 30 + }, + { + "entropy": 1.3506858348846436, + "epoch": 0.07560975609756097, + "grad_norm": 2.451052665710449, + "learning_rate": 3.6585365853658537e-06, + "loss": 2.3136, + "mean_token_accuracy": 0.5398555994033813, + "num_tokens": 423174.0, + "step": 31 + }, + { + "entropy": 1.3611050844192505, + "epoch": 0.07804878048780488, + "grad_norm": 2.2884960174560547, + "learning_rate": 3.780487804878049e-06, + "loss": 2.2667, + "mean_token_accuracy": 0.544750988483429, + "num_tokens": 437145.0, + "step": 32 + }, + { + "entropy": 1.3767828941345215, + "epoch": 0.08048780487804878, + "grad_norm": 2.235347270965576, + "learning_rate": 3.902439024390244e-06, + "loss": 2.2752, + "mean_token_accuracy": 0.5403356552124023, + "num_tokens": 450747.0, + "step": 33 + }, + { + "entropy": 1.3706166744232178, + "epoch": 0.08292682926829269, + "grad_norm": 2.2171695232391357, + "learning_rate": 4.024390243902439e-06, + "loss": 2.2598, + "mean_token_accuracy": 0.5407041907310486, + "num_tokens": 464140.0, + "step": 34 + }, + { + "entropy": 1.365933895111084, + "epoch": 0.08536585365853659, + "grad_norm": 2.0118744373321533, + "learning_rate": 4.146341463414634e-06, + "loss": 2.1878, + "mean_token_accuracy": 0.5497426986694336, + "num_tokens": 478148.0, + "step": 35 + }, + { + "entropy": 1.3866227865219116, + "epoch": 0.08780487804878048, + "grad_norm": 2.1824421882629395, + "learning_rate": 4.268292682926829e-06, + "loss": 2.2444, + "mean_token_accuracy": 0.540566623210907, + "num_tokens": 491648.0, + "step": 36 + }, + { + "entropy": 1.3471767902374268, + "epoch": 0.09024390243902439, + "grad_norm": 2.1132051944732666, + "learning_rate": 4.390243902439025e-06, + "loss": 2.1395, + "mean_token_accuracy": 0.5577816367149353, + "num_tokens": 505440.0, + "step": 37 + }, + { + "entropy": 1.3639988899230957, + "epoch": 0.09268292682926829, + "grad_norm": 2.2273340225219727, + "learning_rate": 4.51219512195122e-06, + "loss": 2.1572, + "mean_token_accuracy": 0.5511302351951599, + "num_tokens": 519391.0, + "step": 38 + }, + { + "entropy": 1.3907160758972168, + "epoch": 0.0951219512195122, + "grad_norm": 2.156001567840576, + "learning_rate": 4.634146341463416e-06, + "loss": 2.1303, + "mean_token_accuracy": 0.550815761089325, + "num_tokens": 533566.0, + "step": 39 + }, + { + "entropy": 1.4015930891036987, + "epoch": 0.0975609756097561, + "grad_norm": 2.23702335357666, + "learning_rate": 4.75609756097561e-06, + "loss": 2.1161, + "mean_token_accuracy": 0.5525180697441101, + "num_tokens": 547263.0, + "step": 40 + }, + { + "entropy": 1.4148375988006592, + "epoch": 0.1, + "grad_norm": 2.0875184535980225, + "learning_rate": 4.8780487804878055e-06, + "loss": 2.1296, + "mean_token_accuracy": 0.5491034984588623, + "num_tokens": 560832.0, + "step": 41 + }, + { + "entropy": 1.3900272846221924, + "epoch": 0.1024390243902439, + "grad_norm": 1.8485113382339478, + "learning_rate": 5e-06, + "loss": 2.0327, + "mean_token_accuracy": 0.5592963099479675, + "num_tokens": 575343.0, + "step": 42 + }, + { + "entropy": 1.4016730785369873, + "epoch": 0.1048780487804878, + "grad_norm": 1.9560102224349976, + "learning_rate": 5.121951219512195e-06, + "loss": 2.0757, + "mean_token_accuracy": 0.5556688904762268, + "num_tokens": 589083.0, + "step": 43 + }, + { + "entropy": 1.4096416234970093, + "epoch": 0.1073170731707317, + "grad_norm": 1.9796462059020996, + "learning_rate": 5.243902439024391e-06, + "loss": 2.0705, + "mean_token_accuracy": 0.5589586496353149, + "num_tokens": 602812.0, + "step": 44 + }, + { + "entropy": 1.3986564874649048, + "epoch": 0.10975609756097561, + "grad_norm": 1.8606263399124146, + "learning_rate": 5.365853658536586e-06, + "loss": 2.0227, + "mean_token_accuracy": 0.5688395500183105, + "num_tokens": 616512.0, + "step": 45 + }, + { + "entropy": 1.3871984481811523, + "epoch": 0.11219512195121951, + "grad_norm": 1.9525058269500732, + "learning_rate": 5.487804878048781e-06, + "loss": 2.0, + "mean_token_accuracy": 0.5685309767723083, + "num_tokens": 629938.0, + "step": 46 + }, + { + "entropy": 1.409059762954712, + "epoch": 0.11463414634146342, + "grad_norm": 1.9902775287628174, + "learning_rate": 5.609756097560977e-06, + "loss": 1.9671, + "mean_token_accuracy": 0.5718567371368408, + "num_tokens": 643634.0, + "step": 47 + }, + { + "entropy": 1.4133785963058472, + "epoch": 0.11707317073170732, + "grad_norm": 1.8982359170913696, + "learning_rate": 5.731707317073171e-06, + "loss": 1.95, + "mean_token_accuracy": 0.5621143579483032, + "num_tokens": 657101.0, + "step": 48 + }, + { + "entropy": 1.4381654262542725, + "epoch": 0.11951219512195121, + "grad_norm": 1.773857593536377, + "learning_rate": 5.853658536585366e-06, + "loss": 1.9087, + "mean_token_accuracy": 0.5688751935958862, + "num_tokens": 671048.0, + "step": 49 + }, + { + "entropy": 1.4031553268432617, + "epoch": 0.12195121951219512, + "grad_norm": 1.817074179649353, + "learning_rate": 5.9756097560975615e-06, + "loss": 1.8871, + "mean_token_accuracy": 0.5758497714996338, + "num_tokens": 684538.0, + "step": 50 + }, + { + "entropy": 1.4128111600875854, + "epoch": 0.12439024390243902, + "grad_norm": 1.7912412881851196, + "learning_rate": 6.0975609756097564e-06, + "loss": 1.8403, + "mean_token_accuracy": 0.5762076377868652, + "num_tokens": 698155.0, + "step": 51 + }, + { + "entropy": 1.4453322887420654, + "epoch": 0.12682926829268293, + "grad_norm": 1.7575210332870483, + "learning_rate": 6.219512195121951e-06, + "loss": 1.8211, + "mean_token_accuracy": 0.5743882060050964, + "num_tokens": 712024.0, + "step": 52 + }, + { + "entropy": 1.4498295783996582, + "epoch": 0.12926829268292683, + "grad_norm": 1.9219143390655518, + "learning_rate": 6.341463414634147e-06, + "loss": 1.8002, + "mean_token_accuracy": 0.57662433385849, + "num_tokens": 725815.0, + "step": 53 + }, + { + "entropy": 1.4735064506530762, + "epoch": 0.13170731707317074, + "grad_norm": 1.833960771560669, + "learning_rate": 6.463414634146342e-06, + "loss": 1.7672, + "mean_token_accuracy": 0.5728835463523865, + "num_tokens": 739675.0, + "step": 54 + }, + { + "entropy": 1.4403457641601562, + "epoch": 0.13414634146341464, + "grad_norm": 1.7752681970596313, + "learning_rate": 6.585365853658538e-06, + "loss": 1.7276, + "mean_token_accuracy": 0.5748123526573181, + "num_tokens": 753412.0, + "step": 55 + }, + { + "entropy": 1.4736558198928833, + "epoch": 0.13658536585365855, + "grad_norm": 1.6332184076309204, + "learning_rate": 6.707317073170733e-06, + "loss": 1.7191, + "mean_token_accuracy": 0.5755113363265991, + "num_tokens": 767704.0, + "step": 56 + }, + { + "entropy": 1.4722124338150024, + "epoch": 0.13902439024390245, + "grad_norm": 1.4422837495803833, + "learning_rate": 6.829268292682928e-06, + "loss": 1.6896, + "mean_token_accuracy": 0.5844070315361023, + "num_tokens": 781380.0, + "step": 57 + }, + { + "entropy": 1.471705675125122, + "epoch": 0.14146341463414633, + "grad_norm": 1.4130431413650513, + "learning_rate": 6.951219512195122e-06, + "loss": 1.6775, + "mean_token_accuracy": 0.579021692276001, + "num_tokens": 794848.0, + "step": 58 + }, + { + "entropy": 1.4454843997955322, + "epoch": 0.14390243902439023, + "grad_norm": 1.0664691925048828, + "learning_rate": 7.0731707317073175e-06, + "loss": 1.6227, + "mean_token_accuracy": 0.5937726497650146, + "num_tokens": 808674.0, + "step": 59 + }, + { + "entropy": 1.4620258808135986, + "epoch": 0.14634146341463414, + "grad_norm": 0.9696555137634277, + "learning_rate": 7.1951219512195125e-06, + "loss": 1.6272, + "mean_token_accuracy": 0.5890450477600098, + "num_tokens": 822492.0, + "step": 60 + }, + { + "entropy": 1.4692648649215698, + "epoch": 0.14878048780487804, + "grad_norm": 0.9255425930023193, + "learning_rate": 7.317073170731707e-06, + "loss": 1.6179, + "mean_token_accuracy": 0.5979862809181213, + "num_tokens": 836214.0, + "step": 61 + }, + { + "entropy": 1.447242259979248, + "epoch": 0.15121951219512195, + "grad_norm": 1.013901710510254, + "learning_rate": 7.439024390243903e-06, + "loss": 1.6052, + "mean_token_accuracy": 0.5923852324485779, + "num_tokens": 849625.0, + "step": 62 + }, + { + "entropy": 1.4720077514648438, + "epoch": 0.15365853658536585, + "grad_norm": 0.9797464609146118, + "learning_rate": 7.560975609756098e-06, + "loss": 1.6078, + "mean_token_accuracy": 0.5976994633674622, + "num_tokens": 863290.0, + "step": 63 + }, + { + "entropy": 1.4570248126983643, + "epoch": 0.15609756097560976, + "grad_norm": 0.8936851024627686, + "learning_rate": 7.682926829268293e-06, + "loss": 1.582, + "mean_token_accuracy": 0.5977935791015625, + "num_tokens": 877356.0, + "step": 64 + }, + { + "entropy": 1.4674687385559082, + "epoch": 0.15853658536585366, + "grad_norm": 0.8669166564941406, + "learning_rate": 7.804878048780489e-06, + "loss": 1.5665, + "mean_token_accuracy": 0.5966126322746277, + "num_tokens": 891247.0, + "step": 65 + }, + { + "entropy": 1.4274413585662842, + "epoch": 0.16097560975609757, + "grad_norm": 0.7330517172813416, + "learning_rate": 7.926829268292685e-06, + "loss": 1.5316, + "mean_token_accuracy": 0.6071735620498657, + "num_tokens": 905231.0, + "step": 66 + }, + { + "entropy": 1.4401626586914062, + "epoch": 0.16341463414634147, + "grad_norm": 0.690156102180481, + "learning_rate": 8.048780487804879e-06, + "loss": 1.531, + "mean_token_accuracy": 0.6064469218254089, + "num_tokens": 919052.0, + "step": 67 + }, + { + "entropy": 1.4117504358291626, + "epoch": 0.16585365853658537, + "grad_norm": 0.6785106062889099, + "learning_rate": 8.170731707317073e-06, + "loss": 1.5115, + "mean_token_accuracy": 0.611976146697998, + "num_tokens": 932812.0, + "step": 68 + }, + { + "entropy": 1.4311751127243042, + "epoch": 0.16829268292682928, + "grad_norm": 0.6311522722244263, + "learning_rate": 8.292682926829268e-06, + "loss": 1.5089, + "mean_token_accuracy": 0.6128394603729248, + "num_tokens": 946489.0, + "step": 69 + }, + { + "entropy": 1.4168494939804077, + "epoch": 0.17073170731707318, + "grad_norm": 0.6667038798332214, + "learning_rate": 8.414634146341464e-06, + "loss": 1.5077, + "mean_token_accuracy": 0.6107392311096191, + "num_tokens": 960100.0, + "step": 70 + }, + { + "entropy": 1.3982901573181152, + "epoch": 0.17317073170731706, + "grad_norm": 0.6248977184295654, + "learning_rate": 8.536585365853658e-06, + "loss": 1.4698, + "mean_token_accuracy": 0.6139276623725891, + "num_tokens": 974160.0, + "step": 71 + }, + { + "entropy": 1.413541555404663, + "epoch": 0.17560975609756097, + "grad_norm": 0.6343597173690796, + "learning_rate": 8.658536585365854e-06, + "loss": 1.4917, + "mean_token_accuracy": 0.6131435036659241, + "num_tokens": 987734.0, + "step": 72 + }, + { + "entropy": 1.4195303916931152, + "epoch": 0.17804878048780487, + "grad_norm": 0.6111395359039307, + "learning_rate": 8.78048780487805e-06, + "loss": 1.4663, + "mean_token_accuracy": 0.6168110370635986, + "num_tokens": 1001717.0, + "step": 73 + }, + { + "entropy": 1.40428626537323, + "epoch": 0.18048780487804877, + "grad_norm": 0.6199101805686951, + "learning_rate": 8.902439024390244e-06, + "loss": 1.4651, + "mean_token_accuracy": 0.6148563623428345, + "num_tokens": 1015276.0, + "step": 74 + }, + { + "entropy": 1.3776400089263916, + "epoch": 0.18292682926829268, + "grad_norm": 0.5942406058311462, + "learning_rate": 9.02439024390244e-06, + "loss": 1.4377, + "mean_token_accuracy": 0.6237963438034058, + "num_tokens": 1029000.0, + "step": 75 + }, + { + "entropy": 1.3991613388061523, + "epoch": 0.18536585365853658, + "grad_norm": 0.5812368392944336, + "learning_rate": 9.146341463414635e-06, + "loss": 1.4434, + "mean_token_accuracy": 0.6211662292480469, + "num_tokens": 1042547.0, + "step": 76 + }, + { + "entropy": 1.3818302154541016, + "epoch": 0.1878048780487805, + "grad_norm": 0.5597359538078308, + "learning_rate": 9.268292682926831e-06, + "loss": 1.427, + "mean_token_accuracy": 0.6249622702598572, + "num_tokens": 1055823.0, + "step": 77 + }, + { + "entropy": 1.3505898714065552, + "epoch": 0.1902439024390244, + "grad_norm": 0.5411974787712097, + "learning_rate": 9.390243902439025e-06, + "loss": 1.382, + "mean_token_accuracy": 0.6296323537826538, + "num_tokens": 1069412.0, + "step": 78 + }, + { + "entropy": 1.3784689903259277, + "epoch": 0.1926829268292683, + "grad_norm": 0.5365982055664062, + "learning_rate": 9.51219512195122e-06, + "loss": 1.4017, + "mean_token_accuracy": 0.6252776384353638, + "num_tokens": 1082934.0, + "step": 79 + }, + { + "entropy": 1.3679686784744263, + "epoch": 0.1951219512195122, + "grad_norm": 0.5300019383430481, + "learning_rate": 9.634146341463415e-06, + "loss": 1.402, + "mean_token_accuracy": 0.6283849477767944, + "num_tokens": 1096281.0, + "step": 80 + }, + { + "entropy": 1.3440438508987427, + "epoch": 0.1975609756097561, + "grad_norm": 0.5176823139190674, + "learning_rate": 9.756097560975611e-06, + "loss": 1.3634, + "mean_token_accuracy": 0.6327404975891113, + "num_tokens": 1110298.0, + "step": 81 + }, + { + "entropy": 1.3412446975708008, + "epoch": 0.2, + "grad_norm": 0.5266833305358887, + "learning_rate": 9.878048780487805e-06, + "loss": 1.3741, + "mean_token_accuracy": 0.6278718113899231, + "num_tokens": 1124199.0, + "step": 82 + }, + { + "entropy": 1.3485190868377686, + "epoch": 0.20243902439024392, + "grad_norm": 0.5412361025810242, + "learning_rate": 1e-05, + "loss": 1.3719, + "mean_token_accuracy": 0.6318795680999756, + "num_tokens": 1137697.0, + "step": 83 + }, + { + "entropy": 1.3585059642791748, + "epoch": 0.2048780487804878, + "grad_norm": 0.5267719626426697, + "learning_rate": 9.986449864498647e-06, + "loss": 1.3713, + "mean_token_accuracy": 0.6309584379196167, + "num_tokens": 1151454.0, + "step": 84 + }, + { + "entropy": 1.275984287261963, + "epoch": 0.2073170731707317, + "grad_norm": 0.5076718330383301, + "learning_rate": 9.972899728997291e-06, + "loss": 1.3172, + "mean_token_accuracy": 0.639977216720581, + "num_tokens": 1165533.0, + "step": 85 + }, + { + "entropy": 1.308915376663208, + "epoch": 0.2097560975609756, + "grad_norm": 0.5083034634590149, + "learning_rate": 9.959349593495936e-06, + "loss": 1.3337, + "mean_token_accuracy": 0.6340206265449524, + "num_tokens": 1179129.0, + "step": 86 + }, + { + "entropy": 1.3241677284240723, + "epoch": 0.2121951219512195, + "grad_norm": 0.4542701840400696, + "learning_rate": 9.94579945799458e-06, + "loss": 1.3551, + "mean_token_accuracy": 0.6313815116882324, + "num_tokens": 1192739.0, + "step": 87 + }, + { + "entropy": 1.314582347869873, + "epoch": 0.2146341463414634, + "grad_norm": 0.48066484928131104, + "learning_rate": 9.932249322493226e-06, + "loss": 1.337, + "mean_token_accuracy": 0.6390174627304077, + "num_tokens": 1206149.0, + "step": 88 + }, + { + "entropy": 1.3120391368865967, + "epoch": 0.21707317073170732, + "grad_norm": 0.4342902600765228, + "learning_rate": 9.91869918699187e-06, + "loss": 1.3378, + "mean_token_accuracy": 0.636769711971283, + "num_tokens": 1220492.0, + "step": 89 + }, + { + "entropy": 1.270530104637146, + "epoch": 0.21951219512195122, + "grad_norm": 0.43451136350631714, + "learning_rate": 9.905149051490516e-06, + "loss": 1.3036, + "mean_token_accuracy": 0.6418837904930115, + "num_tokens": 1234671.0, + "step": 90 + }, + { + "entropy": 1.2754387855529785, + "epoch": 0.22195121951219512, + "grad_norm": 0.4188333749771118, + "learning_rate": 9.89159891598916e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.641899585723877, + "num_tokens": 1248711.0, + "step": 91 + }, + { + "entropy": 1.2386054992675781, + "epoch": 0.22439024390243903, + "grad_norm": 0.4193291962146759, + "learning_rate": 9.878048780487805e-06, + "loss": 1.2657, + "mean_token_accuracy": 0.6525153517723083, + "num_tokens": 1262224.0, + "step": 92 + }, + { + "entropy": 1.2513689994812012, + "epoch": 0.22682926829268293, + "grad_norm": 0.428231805562973, + "learning_rate": 9.864498644986451e-06, + "loss": 1.2844, + "mean_token_accuracy": 0.6457121968269348, + "num_tokens": 1275825.0, + "step": 93 + }, + { + "entropy": 1.266829252243042, + "epoch": 0.22926829268292684, + "grad_norm": 0.42669400572776794, + "learning_rate": 9.850948509485095e-06, + "loss": 1.2964, + "mean_token_accuracy": 0.6454920768737793, + "num_tokens": 1289395.0, + "step": 94 + }, + { + "entropy": 1.2397089004516602, + "epoch": 0.23170731707317074, + "grad_norm": 0.409270703792572, + "learning_rate": 9.837398373983741e-06, + "loss": 1.2499, + "mean_token_accuracy": 0.653494119644165, + "num_tokens": 1303177.0, + "step": 95 + }, + { + "entropy": 1.2290325164794922, + "epoch": 0.23414634146341465, + "grad_norm": 0.4218335747718811, + "learning_rate": 9.823848238482386e-06, + "loss": 1.2648, + "mean_token_accuracy": 0.6454984545707703, + "num_tokens": 1317066.0, + "step": 96 + }, + { + "entropy": 1.2639315128326416, + "epoch": 0.23658536585365852, + "grad_norm": 0.41416656970977783, + "learning_rate": 9.81029810298103e-06, + "loss": 1.2838, + "mean_token_accuracy": 0.6459396481513977, + "num_tokens": 1330800.0, + "step": 97 + }, + { + "entropy": 1.244269847869873, + "epoch": 0.23902439024390243, + "grad_norm": 0.40852072834968567, + "learning_rate": 9.796747967479675e-06, + "loss": 1.2441, + "mean_token_accuracy": 0.6508611440658569, + "num_tokens": 1344693.0, + "step": 98 + }, + { + "entropy": 1.2480473518371582, + "epoch": 0.24146341463414633, + "grad_norm": 0.41581472754478455, + "learning_rate": 9.78319783197832e-06, + "loss": 1.2642, + "mean_token_accuracy": 0.6478216648101807, + "num_tokens": 1358435.0, + "step": 99 + }, + { + "entropy": 1.2283647060394287, + "epoch": 0.24390243902439024, + "grad_norm": 0.40649130940437317, + "learning_rate": 9.769647696476967e-06, + "loss": 1.2457, + "mean_token_accuracy": 0.6516292691230774, + "num_tokens": 1371862.0, + "step": 100 + }, + { + "entropy": 1.2162929773330688, + "epoch": 0.24634146341463414, + "grad_norm": 0.40584856271743774, + "learning_rate": 9.756097560975611e-06, + "loss": 1.2369, + "mean_token_accuracy": 0.6583905816078186, + "num_tokens": 1385411.0, + "step": 101 + }, + { + "entropy": 1.208768606185913, + "epoch": 0.24878048780487805, + "grad_norm": 0.39489707350730896, + "learning_rate": 9.742547425474255e-06, + "loss": 1.2277, + "mean_token_accuracy": 0.6547741889953613, + "num_tokens": 1399021.0, + "step": 102 + }, + { + "entropy": 1.2198774814605713, + "epoch": 0.25121951219512195, + "grad_norm": 0.383748322725296, + "learning_rate": 9.7289972899729e-06, + "loss": 1.2335, + "mean_token_accuracy": 0.6584837436676025, + "num_tokens": 1412887.0, + "step": 103 + }, + { + "entropy": 1.221745252609253, + "epoch": 0.25365853658536586, + "grad_norm": 0.39412277936935425, + "learning_rate": 9.715447154471546e-06, + "loss": 1.2403, + "mean_token_accuracy": 0.6498458981513977, + "num_tokens": 1426854.0, + "step": 104 + }, + { + "entropy": 1.217738151550293, + "epoch": 0.25609756097560976, + "grad_norm": 0.3701169192790985, + "learning_rate": 9.70189701897019e-06, + "loss": 1.2242, + "mean_token_accuracy": 0.6566095948219299, + "num_tokens": 1440426.0, + "step": 105 + }, + { + "entropy": 1.1941392421722412, + "epoch": 0.25853658536585367, + "grad_norm": 0.371112585067749, + "learning_rate": 9.688346883468836e-06, + "loss": 1.2115, + "mean_token_accuracy": 0.6609773635864258, + "num_tokens": 1454152.0, + "step": 106 + }, + { + "entropy": 1.1990996599197388, + "epoch": 0.26097560975609757, + "grad_norm": 0.365815669298172, + "learning_rate": 9.67479674796748e-06, + "loss": 1.2128, + "mean_token_accuracy": 0.6610169410705566, + "num_tokens": 1467856.0, + "step": 107 + }, + { + "entropy": 1.1728260517120361, + "epoch": 0.2634146341463415, + "grad_norm": 0.3645389974117279, + "learning_rate": 9.661246612466125e-06, + "loss": 1.1996, + "mean_token_accuracy": 0.6662229895591736, + "num_tokens": 1481396.0, + "step": 108 + }, + { + "entropy": 1.1833685636520386, + "epoch": 0.2658536585365854, + "grad_norm": 0.36212098598480225, + "learning_rate": 9.64769647696477e-06, + "loss": 1.202, + "mean_token_accuracy": 0.6605650186538696, + "num_tokens": 1495288.0, + "step": 109 + }, + { + "entropy": 1.1809427738189697, + "epoch": 0.2682926829268293, + "grad_norm": 0.4562148153781891, + "learning_rate": 9.634146341463415e-06, + "loss": 1.1918, + "mean_token_accuracy": 0.6666184663772583, + "num_tokens": 1509141.0, + "step": 110 + }, + { + "entropy": 1.1492688655853271, + "epoch": 0.2707317073170732, + "grad_norm": 0.3741201162338257, + "learning_rate": 9.620596205962061e-06, + "loss": 1.1765, + "mean_token_accuracy": 0.671988844871521, + "num_tokens": 1522748.0, + "step": 111 + }, + { + "entropy": 1.1608304977416992, + "epoch": 0.2731707317073171, + "grad_norm": 0.5636874437332153, + "learning_rate": 9.607046070460706e-06, + "loss": 1.1628, + "mean_token_accuracy": 0.6660817265510559, + "num_tokens": 1536441.0, + "step": 112 + }, + { + "entropy": 1.2037889957427979, + "epoch": 0.275609756097561, + "grad_norm": 0.3735000491142273, + "learning_rate": 9.59349593495935e-06, + "loss": 1.2275, + "mean_token_accuracy": 0.6517032384872437, + "num_tokens": 1550313.0, + "step": 113 + }, + { + "entropy": 1.1596955060958862, + "epoch": 0.2780487804878049, + "grad_norm": 0.37646710872650146, + "learning_rate": 9.579945799457996e-06, + "loss": 1.1889, + "mean_token_accuracy": 0.6636276841163635, + "num_tokens": 1563930.0, + "step": 114 + }, + { + "entropy": 1.1520904302597046, + "epoch": 0.2804878048780488, + "grad_norm": 0.3705466687679291, + "learning_rate": 9.56639566395664e-06, + "loss": 1.1592, + "mean_token_accuracy": 0.671284556388855, + "num_tokens": 1576939.0, + "step": 115 + }, + { + "entropy": 1.1546409130096436, + "epoch": 0.28292682926829266, + "grad_norm": 0.3923504948616028, + "learning_rate": 9.552845528455286e-06, + "loss": 1.1643, + "mean_token_accuracy": 0.6669601202011108, + "num_tokens": 1590587.0, + "step": 116 + }, + { + "entropy": 1.1506261825561523, + "epoch": 0.28536585365853656, + "grad_norm": 0.3831369876861572, + "learning_rate": 9.53929539295393e-06, + "loss": 1.1585, + "mean_token_accuracy": 0.670432984828949, + "num_tokens": 1603790.0, + "step": 117 + }, + { + "entropy": 1.1624176502227783, + "epoch": 0.28780487804878047, + "grad_norm": 0.3876406252384186, + "learning_rate": 9.525745257452575e-06, + "loss": 1.1547, + "mean_token_accuracy": 0.6688105463981628, + "num_tokens": 1617333.0, + "step": 118 + }, + { + "entropy": 1.1381282806396484, + "epoch": 0.29024390243902437, + "grad_norm": 0.3799033761024475, + "learning_rate": 9.51219512195122e-06, + "loss": 1.1325, + "mean_token_accuracy": 0.6745951175689697, + "num_tokens": 1631181.0, + "step": 119 + }, + { + "entropy": 1.1472123861312866, + "epoch": 0.2926829268292683, + "grad_norm": 0.40002092719078064, + "learning_rate": 9.498644986449865e-06, + "loss": 1.131, + "mean_token_accuracy": 0.6772284507751465, + "num_tokens": 1644547.0, + "step": 120 + }, + { + "entropy": 1.1485984325408936, + "epoch": 0.2951219512195122, + "grad_norm": 0.41155263781547546, + "learning_rate": 9.485094850948512e-06, + "loss": 1.1314, + "mean_token_accuracy": 0.6737343072891235, + "num_tokens": 1658429.0, + "step": 121 + }, + { + "entropy": 1.1536389589309692, + "epoch": 0.2975609756097561, + "grad_norm": 0.39085325598716736, + "learning_rate": 9.471544715447156e-06, + "loss": 1.1478, + "mean_token_accuracy": 0.6709743738174438, + "num_tokens": 1672064.0, + "step": 122 + }, + { + "entropy": 1.1491937637329102, + "epoch": 0.3, + "grad_norm": 0.33603760600090027, + "learning_rate": 9.4579945799458e-06, + "loss": 1.1478, + "mean_token_accuracy": 0.6709640622138977, + "num_tokens": 1686197.0, + "step": 123 + }, + { + "entropy": 1.1482875347137451, + "epoch": 0.3024390243902439, + "grad_norm": 0.32015225291252136, + "learning_rate": 9.444444444444445e-06, + "loss": 1.138, + "mean_token_accuracy": 0.6719373464584351, + "num_tokens": 1700253.0, + "step": 124 + }, + { + "entropy": 1.153489589691162, + "epoch": 0.3048780487804878, + "grad_norm": 0.32207733392715454, + "learning_rate": 9.43089430894309e-06, + "loss": 1.1585, + "mean_token_accuracy": 0.6705389022827148, + "num_tokens": 1713870.0, + "step": 125 + }, + { + "entropy": 1.107184648513794, + "epoch": 0.3073170731707317, + "grad_norm": 0.3159145712852478, + "learning_rate": 9.417344173441735e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.6776368618011475, + "num_tokens": 1727529.0, + "step": 126 + }, + { + "entropy": 1.1219724416732788, + "epoch": 0.3097560975609756, + "grad_norm": 0.2986023426055908, + "learning_rate": 9.403794037940381e-06, + "loss": 1.1253, + "mean_token_accuracy": 0.673386812210083, + "num_tokens": 1742128.0, + "step": 127 + }, + { + "entropy": 1.1211440563201904, + "epoch": 0.3121951219512195, + "grad_norm": 0.33409249782562256, + "learning_rate": 9.390243902439025e-06, + "loss": 1.1226, + "mean_token_accuracy": 0.6810810565948486, + "num_tokens": 1755834.0, + "step": 128 + }, + { + "entropy": 1.1153686046600342, + "epoch": 0.3146341463414634, + "grad_norm": 0.29576805233955383, + "learning_rate": 9.37669376693767e-06, + "loss": 1.112, + "mean_token_accuracy": 0.6776238083839417, + "num_tokens": 1769418.0, + "step": 129 + }, + { + "entropy": 1.138454794883728, + "epoch": 0.3170731707317073, + "grad_norm": 0.28794384002685547, + "learning_rate": 9.363143631436316e-06, + "loss": 1.1295, + "mean_token_accuracy": 0.6768421530723572, + "num_tokens": 1783127.0, + "step": 130 + }, + { + "entropy": 1.0799577236175537, + "epoch": 0.3195121951219512, + "grad_norm": 0.3011602759361267, + "learning_rate": 9.34959349593496e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6856415867805481, + "num_tokens": 1796898.0, + "step": 131 + }, + { + "entropy": 1.1080092191696167, + "epoch": 0.32195121951219513, + "grad_norm": 0.28195154666900635, + "learning_rate": 9.336043360433606e-06, + "loss": 1.1034, + "mean_token_accuracy": 0.6803351044654846, + "num_tokens": 1810522.0, + "step": 132 + }, + { + "entropy": 1.0953476428985596, + "epoch": 0.32439024390243903, + "grad_norm": 0.2998756468296051, + "learning_rate": 9.32249322493225e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6815518140792847, + "num_tokens": 1824019.0, + "step": 133 + }, + { + "entropy": 1.1013312339782715, + "epoch": 0.32682926829268294, + "grad_norm": 0.3195270001888275, + "learning_rate": 9.308943089430895e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6816878914833069, + "num_tokens": 1837638.0, + "step": 134 + }, + { + "entropy": 1.097846508026123, + "epoch": 0.32926829268292684, + "grad_norm": 0.31614819169044495, + "learning_rate": 9.29539295392954e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6832810044288635, + "num_tokens": 1851357.0, + "step": 135 + }, + { + "entropy": 1.1029002666473389, + "epoch": 0.33170731707317075, + "grad_norm": 0.3023568391799927, + "learning_rate": 9.281842818428185e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.68416428565979, + "num_tokens": 1865032.0, + "step": 136 + }, + { + "entropy": 1.100654125213623, + "epoch": 0.33414634146341465, + "grad_norm": 0.30091455578804016, + "learning_rate": 9.268292682926831e-06, + "loss": 1.1102, + "mean_token_accuracy": 0.6823589205741882, + "num_tokens": 1878834.0, + "step": 137 + }, + { + "entropy": 1.0853767395019531, + "epoch": 0.33658536585365856, + "grad_norm": 0.28139373660087585, + "learning_rate": 9.254742547425476e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6865951418876648, + "num_tokens": 1892733.0, + "step": 138 + }, + { + "entropy": 1.1051311492919922, + "epoch": 0.33902439024390246, + "grad_norm": 0.2940380275249481, + "learning_rate": 9.24119241192412e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6799636483192444, + "num_tokens": 1907063.0, + "step": 139 + }, + { + "entropy": 1.1128774881362915, + "epoch": 0.34146341463414637, + "grad_norm": 0.2861708104610443, + "learning_rate": 9.227642276422764e-06, + "loss": 1.1021, + "mean_token_accuracy": 0.6820879578590393, + "num_tokens": 1921064.0, + "step": 140 + }, + { + "entropy": 1.1019363403320312, + "epoch": 0.3439024390243902, + "grad_norm": 0.2888548970222473, + "learning_rate": 9.21409214092141e-06, + "loss": 1.1021, + "mean_token_accuracy": 0.6837447881698608, + "num_tokens": 1934528.0, + "step": 141 + }, + { + "entropy": 1.0777734518051147, + "epoch": 0.3463414634146341, + "grad_norm": 0.3018644452095032, + "learning_rate": 9.200542005420055e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6913608312606812, + "num_tokens": 1947867.0, + "step": 142 + }, + { + "entropy": 1.0810433626174927, + "epoch": 0.348780487804878, + "grad_norm": 0.29657524824142456, + "learning_rate": 9.1869918699187e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6829285621643066, + "num_tokens": 1962088.0, + "step": 143 + }, + { + "entropy": 1.0838487148284912, + "epoch": 0.35121951219512193, + "grad_norm": 0.30069437623023987, + "learning_rate": 9.173441734417345e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6872337460517883, + "num_tokens": 1975718.0, + "step": 144 + }, + { + "entropy": 1.0671093463897705, + "epoch": 0.35365853658536583, + "grad_norm": 0.32158926129341125, + "learning_rate": 9.15989159891599e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6920549273490906, + "num_tokens": 1989428.0, + "step": 145 + }, + { + "entropy": 1.0684272050857544, + "epoch": 0.35609756097560974, + "grad_norm": 0.3996563255786896, + "learning_rate": 9.146341463414635e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6908498406410217, + "num_tokens": 2002952.0, + "step": 146 + }, + { + "entropy": 1.0824508666992188, + "epoch": 0.35853658536585364, + "grad_norm": 0.2806661128997803, + "learning_rate": 9.13279132791328e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.6818480491638184, + "num_tokens": 2016669.0, + "step": 147 + }, + { + "entropy": 1.077519178390503, + "epoch": 0.36097560975609755, + "grad_norm": 0.29812970757484436, + "learning_rate": 9.119241192411926e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6873586773872375, + "num_tokens": 2030394.0, + "step": 148 + }, + { + "entropy": 1.0674171447753906, + "epoch": 0.36341463414634145, + "grad_norm": 0.3090595006942749, + "learning_rate": 9.10569105691057e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6799144744873047, + "num_tokens": 2043972.0, + "step": 149 + }, + { + "entropy": 1.088179349899292, + "epoch": 0.36585365853658536, + "grad_norm": 0.3295019268989563, + "learning_rate": 9.092140921409215e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.6850416660308838, + "num_tokens": 2057793.0, + "step": 150 + }, + { + "entropy": 1.073317050933838, + "epoch": 0.36829268292682926, + "grad_norm": 0.31593307852745056, + "learning_rate": 9.07859078590786e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6870791912078857, + "num_tokens": 2071771.0, + "step": 151 + }, + { + "entropy": 1.070550799369812, + "epoch": 0.37073170731707317, + "grad_norm": 0.28268736600875854, + "learning_rate": 9.065040650406505e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6912903785705566, + "num_tokens": 2085473.0, + "step": 152 + }, + { + "entropy": 1.0618209838867188, + "epoch": 0.37317073170731707, + "grad_norm": 0.2891680598258972, + "learning_rate": 9.051490514905151e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6930386424064636, + "num_tokens": 2099380.0, + "step": 153 + }, + { + "entropy": 1.0524916648864746, + "epoch": 0.375609756097561, + "grad_norm": 0.31244540214538574, + "learning_rate": 9.037940379403795e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6923752427101135, + "num_tokens": 2113062.0, + "step": 154 + }, + { + "entropy": 1.0701351165771484, + "epoch": 0.3780487804878049, + "grad_norm": 0.2857276201248169, + "learning_rate": 9.02439024390244e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6889170408248901, + "num_tokens": 2126486.0, + "step": 155 + }, + { + "entropy": 1.0808346271514893, + "epoch": 0.3804878048780488, + "grad_norm": 0.30712175369262695, + "learning_rate": 9.010840108401084e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6845914721488953, + "num_tokens": 2140319.0, + "step": 156 + }, + { + "entropy": 1.0551326274871826, + "epoch": 0.3829268292682927, + "grad_norm": 0.29289886355400085, + "learning_rate": 8.99728997289973e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6889886260032654, + "num_tokens": 2154148.0, + "step": 157 + }, + { + "entropy": 1.044635534286499, + "epoch": 0.3853658536585366, + "grad_norm": 0.2924291491508484, + "learning_rate": 8.983739837398374e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6969406604766846, + "num_tokens": 2167729.0, + "step": 158 + }, + { + "entropy": 1.0424062013626099, + "epoch": 0.3878048780487805, + "grad_norm": 0.33836257457733154, + "learning_rate": 8.970189701897019e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6928502917289734, + "num_tokens": 2181354.0, + "step": 159 + }, + { + "entropy": 1.0367578268051147, + "epoch": 0.3902439024390244, + "grad_norm": 0.29433733224868774, + "learning_rate": 8.956639566395665e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6925205588340759, + "num_tokens": 2195101.0, + "step": 160 + }, + { + "entropy": 1.0720441341400146, + "epoch": 0.3926829268292683, + "grad_norm": 0.33182695508003235, + "learning_rate": 8.94308943089431e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6875677108764648, + "num_tokens": 2208960.0, + "step": 161 + }, + { + "entropy": 1.0333222150802612, + "epoch": 0.3951219512195122, + "grad_norm": 0.32103872299194336, + "learning_rate": 8.929539295392955e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6961764693260193, + "num_tokens": 2223073.0, + "step": 162 + }, + { + "entropy": 1.0387994050979614, + "epoch": 0.3975609756097561, + "grad_norm": 0.29320165514945984, + "learning_rate": 8.9159891598916e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6899910569190979, + "num_tokens": 2236537.0, + "step": 163 + }, + { + "entropy": 1.04783296585083, + "epoch": 0.4, + "grad_norm": 0.305354505777359, + "learning_rate": 8.902439024390244e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6935980319976807, + "num_tokens": 2250205.0, + "step": 164 + }, + { + "entropy": 1.0558719635009766, + "epoch": 0.4024390243902439, + "grad_norm": 0.3049624264240265, + "learning_rate": 8.888888888888888e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6879761815071106, + "num_tokens": 2264002.0, + "step": 165 + }, + { + "entropy": 1.0384637117385864, + "epoch": 0.40487804878048783, + "grad_norm": 0.30772659182548523, + "learning_rate": 8.875338753387534e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6911885738372803, + "num_tokens": 2277353.0, + "step": 166 + }, + { + "entropy": 1.0217058658599854, + "epoch": 0.4073170731707317, + "grad_norm": 0.29792320728302, + "learning_rate": 8.86178861788618e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6998187303543091, + "num_tokens": 2290611.0, + "step": 167 + }, + { + "entropy": 1.0324389934539795, + "epoch": 0.4097560975609756, + "grad_norm": 0.3367704451084137, + "learning_rate": 8.848238482384825e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6971227526664734, + "num_tokens": 2303695.0, + "step": 168 + }, + { + "entropy": 1.0322160720825195, + "epoch": 0.4121951219512195, + "grad_norm": 0.34955868124961853, + "learning_rate": 8.834688346883469e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6930278539657593, + "num_tokens": 2317810.0, + "step": 169 + }, + { + "entropy": 1.0324090719223022, + "epoch": 0.4146341463414634, + "grad_norm": 0.33210307359695435, + "learning_rate": 8.821138211382113e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6991522312164307, + "num_tokens": 2331627.0, + "step": 170 + }, + { + "entropy": 1.0623321533203125, + "epoch": 0.4170731707317073, + "grad_norm": 0.3067868947982788, + "learning_rate": 8.80758807588076e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6889886260032654, + "num_tokens": 2345456.0, + "step": 171 + }, + { + "entropy": 1.0518372058868408, + "epoch": 0.4195121951219512, + "grad_norm": 0.3096657395362854, + "learning_rate": 8.794037940379404e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6929498910903931, + "num_tokens": 2359486.0, + "step": 172 + }, + { + "entropy": 1.030791997909546, + "epoch": 0.4219512195121951, + "grad_norm": 0.3270096480846405, + "learning_rate": 8.78048780487805e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6991227865219116, + "num_tokens": 2373182.0, + "step": 173 + }, + { + "entropy": 1.0519341230392456, + "epoch": 0.424390243902439, + "grad_norm": 0.31375235319137573, + "learning_rate": 8.766937669376694e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6936818957328796, + "num_tokens": 2386968.0, + "step": 174 + }, + { + "entropy": 1.0423860549926758, + "epoch": 0.4268292682926829, + "grad_norm": 0.34139835834503174, + "learning_rate": 8.753387533875339e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.694669783115387, + "num_tokens": 2400792.0, + "step": 175 + }, + { + "entropy": 1.0326882600784302, + "epoch": 0.4292682926829268, + "grad_norm": 0.3083813190460205, + "learning_rate": 8.739837398373985e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6961446404457092, + "num_tokens": 2414581.0, + "step": 176 + }, + { + "entropy": 1.0297658443450928, + "epoch": 0.4317073170731707, + "grad_norm": 0.3456875681877136, + "learning_rate": 8.726287262872629e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6909603476524353, + "num_tokens": 2428071.0, + "step": 177 + }, + { + "entropy": 1.0350985527038574, + "epoch": 0.43414634146341463, + "grad_norm": 0.3167937397956848, + "learning_rate": 8.712737127371275e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6956335306167603, + "num_tokens": 2442080.0, + "step": 178 + }, + { + "entropy": 1.0534954071044922, + "epoch": 0.43658536585365854, + "grad_norm": 0.34160587191581726, + "learning_rate": 8.69918699186992e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6857367753982544, + "num_tokens": 2456062.0, + "step": 179 + }, + { + "entropy": 1.019012689590454, + "epoch": 0.43902439024390244, + "grad_norm": 0.3717818856239319, + "learning_rate": 8.685636856368564e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6978288292884827, + "num_tokens": 2469481.0, + "step": 180 + }, + { + "entropy": 1.0156989097595215, + "epoch": 0.44146341463414634, + "grad_norm": 0.3390960395336151, + "learning_rate": 8.67208672086721e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6981892585754395, + "num_tokens": 2483138.0, + "step": 181 + }, + { + "entropy": 1.0245683193206787, + "epoch": 0.44390243902439025, + "grad_norm": 0.3798343539237976, + "learning_rate": 8.658536585365854e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7003815174102783, + "num_tokens": 2496521.0, + "step": 182 + }, + { + "entropy": 1.0270142555236816, + "epoch": 0.44634146341463415, + "grad_norm": 0.5196627974510193, + "learning_rate": 8.6449864498645e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6966584920883179, + "num_tokens": 2509974.0, + "step": 183 + }, + { + "entropy": 1.028610348701477, + "epoch": 0.44878048780487806, + "grad_norm": 0.34398967027664185, + "learning_rate": 8.631436314363144e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6969505548477173, + "num_tokens": 2524222.0, + "step": 184 + }, + { + "entropy": 1.0180282592773438, + "epoch": 0.45121951219512196, + "grad_norm": 0.3196260631084442, + "learning_rate": 8.617886178861789e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7003494501113892, + "num_tokens": 2537974.0, + "step": 185 + }, + { + "entropy": 1.0140835046768188, + "epoch": 0.45365853658536587, + "grad_norm": 0.3342556655406952, + "learning_rate": 8.604336043360433e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.698606014251709, + "num_tokens": 2551620.0, + "step": 186 + }, + { + "entropy": 1.0282118320465088, + "epoch": 0.4560975609756098, + "grad_norm": 0.4843715727329254, + "learning_rate": 8.59078590785908e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6956273317337036, + "num_tokens": 2565632.0, + "step": 187 + }, + { + "entropy": 1.0135064125061035, + "epoch": 0.4585365853658537, + "grad_norm": 0.3793366849422455, + "learning_rate": 8.577235772357724e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6974882483482361, + "num_tokens": 2579264.0, + "step": 188 + }, + { + "entropy": 1.028588056564331, + "epoch": 0.4609756097560976, + "grad_norm": 0.40557751059532166, + "learning_rate": 8.56368563685637e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6939067840576172, + "num_tokens": 2592557.0, + "step": 189 + }, + { + "entropy": 1.0189435482025146, + "epoch": 0.4634146341463415, + "grad_norm": 0.3505721688270569, + "learning_rate": 8.550135501355014e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7037708759307861, + "num_tokens": 2605806.0, + "step": 190 + }, + { + "entropy": 1.0245192050933838, + "epoch": 0.4658536585365854, + "grad_norm": 0.35359427332878113, + "learning_rate": 8.536585365853658e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6959211826324463, + "num_tokens": 2619723.0, + "step": 191 + }, + { + "entropy": 1.0338459014892578, + "epoch": 0.4682926829268293, + "grad_norm": 0.397707998752594, + "learning_rate": 8.523035230352304e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7052842378616333, + "num_tokens": 2633213.0, + "step": 192 + }, + { + "entropy": 1.0224543809890747, + "epoch": 0.47073170731707314, + "grad_norm": 0.35734203457832336, + "learning_rate": 8.509485094850949e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6969306468963623, + "num_tokens": 2647206.0, + "step": 193 + }, + { + "entropy": 1.0173115730285645, + "epoch": 0.47317073170731705, + "grad_norm": 0.34693512320518494, + "learning_rate": 8.495934959349595e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6953300833702087, + "num_tokens": 2660991.0, + "step": 194 + }, + { + "entropy": 1.029531717300415, + "epoch": 0.47560975609756095, + "grad_norm": 0.3539057970046997, + "learning_rate": 8.482384823848239e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6989356875419617, + "num_tokens": 2674725.0, + "step": 195 + }, + { + "entropy": 1.0025036334991455, + "epoch": 0.47804878048780486, + "grad_norm": 0.37867292761802673, + "learning_rate": 8.468834688346883e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7012137770652771, + "num_tokens": 2688500.0, + "step": 196 + }, + { + "entropy": 1.015578269958496, + "epoch": 0.48048780487804876, + "grad_norm": 0.38133302330970764, + "learning_rate": 8.45528455284553e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7015800476074219, + "num_tokens": 2702503.0, + "step": 197 + }, + { + "entropy": 1.0281040668487549, + "epoch": 0.48292682926829267, + "grad_norm": 0.3827400505542755, + "learning_rate": 8.441734417344174e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6993362903594971, + "num_tokens": 2716079.0, + "step": 198 + }, + { + "entropy": 1.0232656002044678, + "epoch": 0.4853658536585366, + "grad_norm": 0.38402533531188965, + "learning_rate": 8.42818428184282e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6984891891479492, + "num_tokens": 2729995.0, + "step": 199 + }, + { + "entropy": 1.0109443664550781, + "epoch": 0.4878048780487805, + "grad_norm": 0.3873913884162903, + "learning_rate": 8.414634146341464e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7005252838134766, + "num_tokens": 2743528.0, + "step": 200 + }, + { + "entropy": 1.0173959732055664, + "epoch": 0.4902439024390244, + "grad_norm": 0.34564492106437683, + "learning_rate": 8.401084010840109e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7038676142692566, + "num_tokens": 2757325.0, + "step": 201 + }, + { + "entropy": 1.0393359661102295, + "epoch": 0.4926829268292683, + "grad_norm": 0.4502638578414917, + "learning_rate": 8.387533875338753e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6990713477134705, + "num_tokens": 2770909.0, + "step": 202 + }, + { + "entropy": 1.0125484466552734, + "epoch": 0.4951219512195122, + "grad_norm": 0.6057789921760559, + "learning_rate": 8.373983739837399e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7036902904510498, + "num_tokens": 2784745.0, + "step": 203 + }, + { + "entropy": 1.0163042545318604, + "epoch": 0.4975609756097561, + "grad_norm": 0.47165754437446594, + "learning_rate": 8.360433604336045e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.702346682548523, + "num_tokens": 2797971.0, + "step": 204 + }, + { + "entropy": 0.9835382103919983, + "epoch": 0.5, + "grad_norm": 0.38579699397087097, + "learning_rate": 8.34688346883469e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7067919969558716, + "num_tokens": 2811179.0, + "step": 205 + }, + { + "entropy": 0.9862759113311768, + "epoch": 0.5024390243902439, + "grad_norm": 0.3595806956291199, + "learning_rate": 8.333333333333334e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7074604630470276, + "num_tokens": 2824800.0, + "step": 206 + }, + { + "entropy": 0.9990562200546265, + "epoch": 0.5048780487804878, + "grad_norm": 0.37352100014686584, + "learning_rate": 8.319783197831978e-06, + "loss": 0.986, + "mean_token_accuracy": 0.702506422996521, + "num_tokens": 2838860.0, + "step": 207 + }, + { + "entropy": 1.0180875062942505, + "epoch": 0.5073170731707317, + "grad_norm": 0.35853612422943115, + "learning_rate": 8.306233062330624e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7018040418624878, + "num_tokens": 2853011.0, + "step": 208 + }, + { + "entropy": 1.0083836317062378, + "epoch": 0.5097560975609756, + "grad_norm": 0.3947008550167084, + "learning_rate": 8.292682926829268e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7039703726768494, + "num_tokens": 2867056.0, + "step": 209 + }, + { + "entropy": 0.9842475056648254, + "epoch": 0.5121951219512195, + "grad_norm": 0.4123150408267975, + "learning_rate": 8.279132791327915e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7054763436317444, + "num_tokens": 2880256.0, + "step": 210 + }, + { + "entropy": 1.0007009506225586, + "epoch": 0.5146341463414634, + "grad_norm": 0.3687898814678192, + "learning_rate": 8.265582655826559e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7061342000961304, + "num_tokens": 2894520.0, + "step": 211 + }, + { + "entropy": 1.0035302639007568, + "epoch": 0.5170731707317073, + "grad_norm": 0.42165127396583557, + "learning_rate": 8.252032520325203e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6972360014915466, + "num_tokens": 2907850.0, + "step": 212 + }, + { + "entropy": 1.0078985691070557, + "epoch": 0.5195121951219512, + "grad_norm": 0.36786288022994995, + "learning_rate": 8.23848238482385e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.702234148979187, + "num_tokens": 2921652.0, + "step": 213 + }, + { + "entropy": 0.994750440120697, + "epoch": 0.5219512195121951, + "grad_norm": 0.3809518814086914, + "learning_rate": 8.224932249322494e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7052788138389587, + "num_tokens": 2935800.0, + "step": 214 + }, + { + "entropy": 0.9751050472259521, + "epoch": 0.524390243902439, + "grad_norm": 0.45397570729255676, + "learning_rate": 8.21138211382114e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.712651252746582, + "num_tokens": 2949451.0, + "step": 215 + }, + { + "entropy": 1.0001088380813599, + "epoch": 0.526829268292683, + "grad_norm": 0.36515524983406067, + "learning_rate": 8.197831978319784e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7006496787071228, + "num_tokens": 2963474.0, + "step": 216 + }, + { + "entropy": 1.0082523822784424, + "epoch": 0.5292682926829269, + "grad_norm": 0.3793857991695404, + "learning_rate": 8.184281842818428e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6996766328811646, + "num_tokens": 2977405.0, + "step": 217 + }, + { + "entropy": 0.9938634634017944, + "epoch": 0.5317073170731708, + "grad_norm": 0.43802475929260254, + "learning_rate": 8.170731707317073e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6989560127258301, + "num_tokens": 2990831.0, + "step": 218 + }, + { + "entropy": 0.989290177822113, + "epoch": 0.5341463414634147, + "grad_norm": 0.3721674978733063, + "learning_rate": 8.157181571815719e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7013733983039856, + "num_tokens": 3004754.0, + "step": 219 + }, + { + "entropy": 0.9925138354301453, + "epoch": 0.5365853658536586, + "grad_norm": 0.4069049656391144, + "learning_rate": 8.143631436314365e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7074543833732605, + "num_tokens": 3018091.0, + "step": 220 + }, + { + "entropy": 0.9971339702606201, + "epoch": 0.5390243902439025, + "grad_norm": 0.3578735291957855, + "learning_rate": 8.130081300813009e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7043969035148621, + "num_tokens": 3031571.0, + "step": 221 + }, + { + "entropy": 0.9838342666625977, + "epoch": 0.5414634146341464, + "grad_norm": 0.38385337591171265, + "learning_rate": 8.116531165311653e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7182108163833618, + "num_tokens": 3045381.0, + "step": 222 + }, + { + "entropy": 0.9965362548828125, + "epoch": 0.5439024390243903, + "grad_norm": 0.396474689245224, + "learning_rate": 8.102981029810298e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.703424870967865, + "num_tokens": 3058945.0, + "step": 223 + }, + { + "entropy": 0.9957606792449951, + "epoch": 0.5463414634146342, + "grad_norm": 0.35657909512519836, + "learning_rate": 8.089430894308944e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7055048942565918, + "num_tokens": 3072676.0, + "step": 224 + }, + { + "entropy": 0.9773147106170654, + "epoch": 0.5487804878048781, + "grad_norm": 0.35940998792648315, + "learning_rate": 8.075880758807588e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7097449898719788, + "num_tokens": 3086535.0, + "step": 225 + }, + { + "entropy": 0.9726331830024719, + "epoch": 0.551219512195122, + "grad_norm": 0.37580767273902893, + "learning_rate": 8.062330623306234e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7098819017410278, + "num_tokens": 3100435.0, + "step": 226 + }, + { + "entropy": 0.9689302444458008, + "epoch": 0.5536585365853659, + "grad_norm": 0.4062517285346985, + "learning_rate": 8.048780487804879e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7093740701675415, + "num_tokens": 3113743.0, + "step": 227 + }, + { + "entropy": 0.9909324049949646, + "epoch": 0.5560975609756098, + "grad_norm": 0.4221988022327423, + "learning_rate": 8.035230352303523e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7099443674087524, + "num_tokens": 3127415.0, + "step": 228 + }, + { + "entropy": 0.9781975746154785, + "epoch": 0.5585365853658537, + "grad_norm": 0.41590189933776855, + "learning_rate": 8.021680216802169e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7079962491989136, + "num_tokens": 3141400.0, + "step": 229 + }, + { + "entropy": 0.9742963314056396, + "epoch": 0.5609756097560976, + "grad_norm": 0.41367363929748535, + "learning_rate": 8.008130081300813e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7071376442909241, + "num_tokens": 3155146.0, + "step": 230 + }, + { + "entropy": 0.9977781772613525, + "epoch": 0.5634146341463414, + "grad_norm": 0.41266530752182007, + "learning_rate": 7.99457994579946e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7003742456436157, + "num_tokens": 3169323.0, + "step": 231 + }, + { + "entropy": 0.9736911058425903, + "epoch": 0.5658536585365853, + "grad_norm": 0.4872662127017975, + "learning_rate": 7.981029810298104e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7085639238357544, + "num_tokens": 3183071.0, + "step": 232 + }, + { + "entropy": 0.9661515355110168, + "epoch": 0.5682926829268292, + "grad_norm": 0.4976981580257416, + "learning_rate": 7.967479674796748e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7196811437606812, + "num_tokens": 3196511.0, + "step": 233 + }, + { + "entropy": 0.9788615107536316, + "epoch": 0.5707317073170731, + "grad_norm": 0.40486401319503784, + "learning_rate": 7.953929539295394e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7100006937980652, + "num_tokens": 3210496.0, + "step": 234 + }, + { + "entropy": 0.9701911211013794, + "epoch": 0.573170731707317, + "grad_norm": 0.3933921158313751, + "learning_rate": 7.940379403794039e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7084203958511353, + "num_tokens": 3224395.0, + "step": 235 + }, + { + "entropy": 0.9639068841934204, + "epoch": 0.5756097560975609, + "grad_norm": 0.4816792607307434, + "learning_rate": 7.926829268292685e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7153215408325195, + "num_tokens": 3238065.0, + "step": 236 + }, + { + "entropy": 0.9722325801849365, + "epoch": 0.5780487804878048, + "grad_norm": 0.5372746586799622, + "learning_rate": 7.913279132791329e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7086333632469177, + "num_tokens": 3251552.0, + "step": 237 + }, + { + "entropy": 1.0175796747207642, + "epoch": 0.5804878048780487, + "grad_norm": 0.4492548406124115, + "learning_rate": 7.899728997289973e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7006236910820007, + "num_tokens": 3265517.0, + "step": 238 + }, + { + "entropy": 0.9735410213470459, + "epoch": 0.5829268292682926, + "grad_norm": 0.8624438643455505, + "learning_rate": 7.886178861788618e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.709635853767395, + "num_tokens": 3279512.0, + "step": 239 + }, + { + "entropy": 0.9796257615089417, + "epoch": 0.5853658536585366, + "grad_norm": 0.4248017370700836, + "learning_rate": 7.872628726287264e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.708400547504425, + "num_tokens": 3293170.0, + "step": 240 + }, + { + "entropy": 0.9722896218299866, + "epoch": 0.5878048780487805, + "grad_norm": 0.3795738220214844, + "learning_rate": 7.859078590785908e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7118780612945557, + "num_tokens": 3306833.0, + "step": 241 + }, + { + "entropy": 0.9680063128471375, + "epoch": 0.5902439024390244, + "grad_norm": 0.3923177719116211, + "learning_rate": 7.845528455284554e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7090327143669128, + "num_tokens": 3320909.0, + "step": 242 + }, + { + "entropy": 0.9560147523880005, + "epoch": 0.5926829268292683, + "grad_norm": 0.41641271114349365, + "learning_rate": 7.831978319783198e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7116286158561707, + "num_tokens": 3334581.0, + "step": 243 + }, + { + "entropy": 0.9750783443450928, + "epoch": 0.5951219512195122, + "grad_norm": 0.3982071578502655, + "learning_rate": 7.818428184281843e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7131319046020508, + "num_tokens": 3348464.0, + "step": 244 + }, + { + "entropy": 0.9652288556098938, + "epoch": 0.5975609756097561, + "grad_norm": 0.4358637034893036, + "learning_rate": 7.804878048780489e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.716895341873169, + "num_tokens": 3361892.0, + "step": 245 + }, + { + "entropy": 0.9535477161407471, + "epoch": 0.6, + "grad_norm": 0.4099048972129822, + "learning_rate": 7.791327913279133e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7090840935707092, + "num_tokens": 3375283.0, + "step": 246 + }, + { + "entropy": 0.9606568813323975, + "epoch": 0.6024390243902439, + "grad_norm": 0.3803389370441437, + "learning_rate": 7.77777777777778e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7069713473320007, + "num_tokens": 3389127.0, + "step": 247 + }, + { + "entropy": 0.9476461410522461, + "epoch": 0.6048780487804878, + "grad_norm": 0.4176231026649475, + "learning_rate": 7.764227642276424e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7153881192207336, + "num_tokens": 3402361.0, + "step": 248 + }, + { + "entropy": 0.9819697141647339, + "epoch": 0.6073170731707317, + "grad_norm": 0.3921288251876831, + "learning_rate": 7.750677506775068e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7068524956703186, + "num_tokens": 3416080.0, + "step": 249 + }, + { + "entropy": 0.976287841796875, + "epoch": 0.6097560975609756, + "grad_norm": 0.40087559819221497, + "learning_rate": 7.737127371273714e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.709733247756958, + "num_tokens": 3429966.0, + "step": 250 + }, + { + "entropy": 0.9725978374481201, + "epoch": 0.6121951219512195, + "grad_norm": 0.48437660932540894, + "learning_rate": 7.723577235772358e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7101976871490479, + "num_tokens": 3443995.0, + "step": 251 + }, + { + "entropy": 0.9459497332572937, + "epoch": 0.6146341463414634, + "grad_norm": 0.5235543847084045, + "learning_rate": 7.710027100271004e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7215677499771118, + "num_tokens": 3457508.0, + "step": 252 + }, + { + "entropy": 0.9774629473686218, + "epoch": 0.6170731707317073, + "grad_norm": 0.3972068130970001, + "learning_rate": 7.696476964769649e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7105010151863098, + "num_tokens": 3471037.0, + "step": 253 + }, + { + "entropy": 0.9459494352340698, + "epoch": 0.6195121951219512, + "grad_norm": 0.4803354740142822, + "learning_rate": 7.682926829268293e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7180070281028748, + "num_tokens": 3484681.0, + "step": 254 + }, + { + "entropy": 0.9822655916213989, + "epoch": 0.6219512195121951, + "grad_norm": 0.5377360582351685, + "learning_rate": 7.669376693766937e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7102938890457153, + "num_tokens": 3498511.0, + "step": 255 + }, + { + "entropy": 0.9723156690597534, + "epoch": 0.624390243902439, + "grad_norm": 0.4719409644603729, + "learning_rate": 7.655826558265583e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7167521715164185, + "num_tokens": 3512370.0, + "step": 256 + }, + { + "entropy": 0.9557574391365051, + "epoch": 0.6268292682926829, + "grad_norm": 0.4471491575241089, + "learning_rate": 7.64227642276423e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7144047617912292, + "num_tokens": 3525583.0, + "step": 257 + }, + { + "entropy": 0.974261999130249, + "epoch": 0.6292682926829268, + "grad_norm": 0.4518379271030426, + "learning_rate": 7.628726287262873e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7040947079658508, + "num_tokens": 3539617.0, + "step": 258 + }, + { + "entropy": 0.9723663330078125, + "epoch": 0.6317073170731707, + "grad_norm": 0.45028042793273926, + "learning_rate": 7.615176151761519e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7073206305503845, + "num_tokens": 3553416.0, + "step": 259 + }, + { + "entropy": 0.9442059993743896, + "epoch": 0.6341463414634146, + "grad_norm": 0.5071269273757935, + "learning_rate": 7.601626016260163e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7091909646987915, + "num_tokens": 3566695.0, + "step": 260 + }, + { + "entropy": 0.9819319248199463, + "epoch": 0.6365853658536585, + "grad_norm": 0.45541971921920776, + "learning_rate": 7.5880758807588085e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7078201770782471, + "num_tokens": 3580189.0, + "step": 261 + }, + { + "entropy": 0.9500574469566345, + "epoch": 0.6390243902439025, + "grad_norm": 0.5451672673225403, + "learning_rate": 7.574525745257453e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7172244191169739, + "num_tokens": 3593622.0, + "step": 262 + }, + { + "entropy": 0.965298056602478, + "epoch": 0.6414634146341464, + "grad_norm": 0.45017296075820923, + "learning_rate": 7.560975609756098e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7146288156509399, + "num_tokens": 3606961.0, + "step": 263 + }, + { + "entropy": 0.9539521932601929, + "epoch": 0.6439024390243903, + "grad_norm": 0.47731754183769226, + "learning_rate": 7.547425474254744e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7130581736564636, + "num_tokens": 3620593.0, + "step": 264 + }, + { + "entropy": 0.9426309466362, + "epoch": 0.6463414634146342, + "grad_norm": 0.5125877857208252, + "learning_rate": 7.5338753387533885e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7107785940170288, + "num_tokens": 3634377.0, + "step": 265 + }, + { + "entropy": 0.968903660774231, + "epoch": 0.6487804878048781, + "grad_norm": 0.5131202340126038, + "learning_rate": 7.520325203252034e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7136064767837524, + "num_tokens": 3648695.0, + "step": 266 + }, + { + "entropy": 0.9822129011154175, + "epoch": 0.651219512195122, + "grad_norm": 0.45986267924308777, + "learning_rate": 7.506775067750678e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7104863524436951, + "num_tokens": 3662548.0, + "step": 267 + }, + { + "entropy": 0.9548493027687073, + "epoch": 0.6536585365853659, + "grad_norm": 0.4756591320037842, + "learning_rate": 7.493224932249323e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.706981897354126, + "num_tokens": 3676099.0, + "step": 268 + }, + { + "entropy": 0.9379334449768066, + "epoch": 0.6560975609756098, + "grad_norm": 0.7421849370002747, + "learning_rate": 7.4796747967479676e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.714191734790802, + "num_tokens": 3689799.0, + "step": 269 + }, + { + "entropy": 0.9444692730903625, + "epoch": 0.6585365853658537, + "grad_norm": 0.7555662393569946, + "learning_rate": 7.466124661246613e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7124654054641724, + "num_tokens": 3703549.0, + "step": 270 + }, + { + "entropy": 0.964323878288269, + "epoch": 0.6609756097560976, + "grad_norm": 0.4882325232028961, + "learning_rate": 7.452574525745257e-06, + "loss": 0.954, + "mean_token_accuracy": 0.714160144329071, + "num_tokens": 3717216.0, + "step": 271 + }, + { + "entropy": 0.9607169032096863, + "epoch": 0.6634146341463415, + "grad_norm": 0.6977524757385254, + "learning_rate": 7.439024390243903e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7098829746246338, + "num_tokens": 3730730.0, + "step": 272 + }, + { + "entropy": 0.9752838611602783, + "epoch": 0.6658536585365854, + "grad_norm": 0.48511064052581787, + "learning_rate": 7.425474254742548e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7046083807945251, + "num_tokens": 3744829.0, + "step": 273 + }, + { + "entropy": 0.9718318581581116, + "epoch": 0.6682926829268293, + "grad_norm": 0.4946161210536957, + "learning_rate": 7.411924119241193e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7096002697944641, + "num_tokens": 3758230.0, + "step": 274 + }, + { + "entropy": 0.9614427089691162, + "epoch": 0.6707317073170732, + "grad_norm": 0.47201916575431824, + "learning_rate": 7.398373983739838e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7111756205558777, + "num_tokens": 3772026.0, + "step": 275 + }, + { + "entropy": 0.9677258729934692, + "epoch": 0.6731707317073171, + "grad_norm": 0.4112757742404938, + "learning_rate": 7.384823848238482e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7081936597824097, + "num_tokens": 3786065.0, + "step": 276 + }, + { + "entropy": 0.9812069535255432, + "epoch": 0.675609756097561, + "grad_norm": 0.5289693474769592, + "learning_rate": 7.371273712737128e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7038695812225342, + "num_tokens": 3799700.0, + "step": 277 + }, + { + "entropy": 0.9499857425689697, + "epoch": 0.6780487804878049, + "grad_norm": 0.4317754805088043, + "learning_rate": 7.357723577235773e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7191169857978821, + "num_tokens": 3813170.0, + "step": 278 + }, + { + "entropy": 0.9827193021774292, + "epoch": 0.6804878048780488, + "grad_norm": 0.5651706457138062, + "learning_rate": 7.344173441734418e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7127862572669983, + "num_tokens": 3826810.0, + "step": 279 + }, + { + "entropy": 0.9605240821838379, + "epoch": 0.6829268292682927, + "grad_norm": 0.4189467430114746, + "learning_rate": 7.330623306233063e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7105111479759216, + "num_tokens": 3840716.0, + "step": 280 + }, + { + "entropy": 0.9565154314041138, + "epoch": 0.6853658536585366, + "grad_norm": 0.41188377141952515, + "learning_rate": 7.317073170731707e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7180280685424805, + "num_tokens": 3854627.0, + "step": 281 + }, + { + "entropy": 0.9569879770278931, + "epoch": 0.6878048780487804, + "grad_norm": 0.4937061369419098, + "learning_rate": 7.303523035230353e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7244262099266052, + "num_tokens": 3867801.0, + "step": 282 + }, + { + "entropy": 0.964173436164856, + "epoch": 0.6902439024390243, + "grad_norm": 0.4437481164932251, + "learning_rate": 7.289972899728998e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.717008113861084, + "num_tokens": 3881093.0, + "step": 283 + }, + { + "entropy": 0.9317843914031982, + "epoch": 0.6926829268292682, + "grad_norm": 0.47311607003211975, + "learning_rate": 7.276422764227643e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7166216373443604, + "num_tokens": 3894808.0, + "step": 284 + }, + { + "entropy": 0.9279685616493225, + "epoch": 0.6951219512195121, + "grad_norm": 0.48283323645591736, + "learning_rate": 7.262872628726287e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7190238237380981, + "num_tokens": 3908633.0, + "step": 285 + }, + { + "entropy": 0.9323222637176514, + "epoch": 0.697560975609756, + "grad_norm": 0.41079843044281006, + "learning_rate": 7.2493224932249325e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7223949432373047, + "num_tokens": 3922478.0, + "step": 286 + }, + { + "entropy": 0.9287464618682861, + "epoch": 0.7, + "grad_norm": 0.44754576683044434, + "learning_rate": 7.2357723577235786e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7185174226760864, + "num_tokens": 3936200.0, + "step": 287 + }, + { + "entropy": 0.9283928871154785, + "epoch": 0.7024390243902439, + "grad_norm": 0.4761507511138916, + "learning_rate": 7.222222222222223e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7208345532417297, + "num_tokens": 3949828.0, + "step": 288 + }, + { + "entropy": 0.9376824498176575, + "epoch": 0.7048780487804878, + "grad_norm": 0.4816683828830719, + "learning_rate": 7.208672086720868e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7185068726539612, + "num_tokens": 3963855.0, + "step": 289 + }, + { + "entropy": 0.9271779656410217, + "epoch": 0.7073170731707317, + "grad_norm": 0.45386019349098206, + "learning_rate": 7.1951219512195125e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7165395021438599, + "num_tokens": 3977499.0, + "step": 290 + }, + { + "entropy": 0.9511876106262207, + "epoch": 0.7097560975609756, + "grad_norm": 0.43612465262413025, + "learning_rate": 7.181571815718158e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7135776281356812, + "num_tokens": 3990831.0, + "step": 291 + }, + { + "entropy": 0.934224009513855, + "epoch": 0.7121951219512195, + "grad_norm": 0.4678195118904114, + "learning_rate": 7.168021680216802e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7236147522926331, + "num_tokens": 4004491.0, + "step": 292 + }, + { + "entropy": 0.9473061561584473, + "epoch": 0.7146341463414634, + "grad_norm": 0.44049718976020813, + "learning_rate": 7.154471544715448e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7178494334220886, + "num_tokens": 4017936.0, + "step": 293 + }, + { + "entropy": 0.9487972259521484, + "epoch": 0.7170731707317073, + "grad_norm": 0.5107514262199402, + "learning_rate": 7.140921409214093e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7122842073440552, + "num_tokens": 4032084.0, + "step": 294 + }, + { + "entropy": 0.9367107152938843, + "epoch": 0.7195121951219512, + "grad_norm": 0.4118102490901947, + "learning_rate": 7.127371273712738e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7204485535621643, + "num_tokens": 4046101.0, + "step": 295 + }, + { + "entropy": 0.9352602958679199, + "epoch": 0.7219512195121951, + "grad_norm": 0.45560237765312195, + "learning_rate": 7.113821138211383e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7115190029144287, + "num_tokens": 4059955.0, + "step": 296 + }, + { + "entropy": 0.9381440877914429, + "epoch": 0.724390243902439, + "grad_norm": 0.4506898522377014, + "learning_rate": 7.100271002710027e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7140668034553528, + "num_tokens": 4073677.0, + "step": 297 + }, + { + "entropy": 0.931757926940918, + "epoch": 0.7268292682926829, + "grad_norm": 0.47680553793907166, + "learning_rate": 7.086720867208673e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.720937192440033, + "num_tokens": 4087052.0, + "step": 298 + }, + { + "entropy": 0.9337366819381714, + "epoch": 0.7292682926829268, + "grad_norm": 0.4333563446998596, + "learning_rate": 7.0731707317073175e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7173944711685181, + "num_tokens": 4100854.0, + "step": 299 + }, + { + "entropy": 0.9619350433349609, + "epoch": 0.7317073170731707, + "grad_norm": 0.45877236127853394, + "learning_rate": 7.059620596205963e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7089931964874268, + "num_tokens": 4114825.0, + "step": 300 + }, + { + "entropy": 0.9500558376312256, + "epoch": 0.7341463414634146, + "grad_norm": 0.608589231967926, + "learning_rate": 7.046070460704607e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7173042893409729, + "num_tokens": 4128329.0, + "step": 301 + }, + { + "entropy": 0.9519375562667847, + "epoch": 0.7365853658536585, + "grad_norm": 0.5312452912330627, + "learning_rate": 7.032520325203252e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7112184166908264, + "num_tokens": 4142224.0, + "step": 302 + }, + { + "entropy": 0.9525036811828613, + "epoch": 0.7390243902439024, + "grad_norm": 0.48584797978401184, + "learning_rate": 7.018970189701898e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.713076114654541, + "num_tokens": 4155822.0, + "step": 303 + }, + { + "entropy": 0.9396979212760925, + "epoch": 0.7414634146341463, + "grad_norm": 0.61397385597229, + "learning_rate": 7.005420054200543e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7213865518569946, + "num_tokens": 4169599.0, + "step": 304 + }, + { + "entropy": 0.9650927782058716, + "epoch": 0.7439024390243902, + "grad_norm": 0.514038622379303, + "learning_rate": 6.991869918699188e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7186560034751892, + "num_tokens": 4183246.0, + "step": 305 + }, + { + "entropy": 0.9711487293243408, + "epoch": 0.7463414634146341, + "grad_norm": 0.45081955194473267, + "learning_rate": 6.978319783197832e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7125919461250305, + "num_tokens": 4196588.0, + "step": 306 + }, + { + "entropy": 0.9525703191757202, + "epoch": 0.748780487804878, + "grad_norm": 0.5370531678199768, + "learning_rate": 6.964769647696477e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7191635370254517, + "num_tokens": 4210281.0, + "step": 307 + }, + { + "entropy": 0.945855975151062, + "epoch": 0.751219512195122, + "grad_norm": 0.516020655632019, + "learning_rate": 6.951219512195122e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7142351865768433, + "num_tokens": 4224431.0, + "step": 308 + }, + { + "entropy": 0.9390691518783569, + "epoch": 0.7536585365853659, + "grad_norm": 0.5693097114562988, + "learning_rate": 6.937669376693768e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.71612149477005, + "num_tokens": 4238143.0, + "step": 309 + }, + { + "entropy": 0.9312563538551331, + "epoch": 0.7560975609756098, + "grad_norm": 0.5105025172233582, + "learning_rate": 6.924119241192413e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.715452253818512, + "num_tokens": 4252487.0, + "step": 310 + }, + { + "entropy": 0.9333822727203369, + "epoch": 0.7585365853658537, + "grad_norm": 0.4418947994709015, + "learning_rate": 6.910569105691057e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7165272831916809, + "num_tokens": 4266014.0, + "step": 311 + }, + { + "entropy": 0.9335235357284546, + "epoch": 0.7609756097560976, + "grad_norm": 0.4711912274360657, + "learning_rate": 6.8970189701897025e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7139325141906738, + "num_tokens": 4279782.0, + "step": 312 + }, + { + "entropy": 0.9249697327613831, + "epoch": 0.7634146341463415, + "grad_norm": 0.4940471351146698, + "learning_rate": 6.883468834688347e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7152402400970459, + "num_tokens": 4293866.0, + "step": 313 + }, + { + "entropy": 0.9464020729064941, + "epoch": 0.7658536585365854, + "grad_norm": 0.4517545700073242, + "learning_rate": 6.869918699186993e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7157577276229858, + "num_tokens": 4307469.0, + "step": 314 + }, + { + "entropy": 0.9276844263076782, + "epoch": 0.7682926829268293, + "grad_norm": 0.46661779284477234, + "learning_rate": 6.856368563685637e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.713527262210846, + "num_tokens": 4321235.0, + "step": 315 + }, + { + "entropy": 0.9301877617835999, + "epoch": 0.7707317073170732, + "grad_norm": 0.4871988296508789, + "learning_rate": 6.8428184281842825e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.718126118183136, + "num_tokens": 4335126.0, + "step": 316 + }, + { + "entropy": 0.9497619867324829, + "epoch": 0.7731707317073171, + "grad_norm": 0.4936281144618988, + "learning_rate": 6.829268292682928e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7093750238418579, + "num_tokens": 4348582.0, + "step": 317 + }, + { + "entropy": 0.9262542724609375, + "epoch": 0.775609756097561, + "grad_norm": 0.47619983553886414, + "learning_rate": 6.815718157181572e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7227988839149475, + "num_tokens": 4362375.0, + "step": 318 + }, + { + "entropy": 0.9480463266372681, + "epoch": 0.7780487804878049, + "grad_norm": 0.46537765860557556, + "learning_rate": 6.802168021680218e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7166230082511902, + "num_tokens": 4376143.0, + "step": 319 + }, + { + "entropy": 0.9295194745063782, + "epoch": 0.7804878048780488, + "grad_norm": 0.4315537214279175, + "learning_rate": 6.788617886178862e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7267556190490723, + "num_tokens": 4389872.0, + "step": 320 + }, + { + "entropy": 0.9453158378601074, + "epoch": 0.7829268292682927, + "grad_norm": 0.47282472252845764, + "learning_rate": 6.775067750677508e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7136433720588684, + "num_tokens": 4404122.0, + "step": 321 + }, + { + "entropy": 0.9408867359161377, + "epoch": 0.7853658536585366, + "grad_norm": 0.47678133845329285, + "learning_rate": 6.761517615176152e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7188437581062317, + "num_tokens": 4417803.0, + "step": 322 + }, + { + "entropy": 0.9601535797119141, + "epoch": 0.7878048780487805, + "grad_norm": 0.47788533568382263, + "learning_rate": 6.747967479674797e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7107544541358948, + "num_tokens": 4431776.0, + "step": 323 + }, + { + "entropy": 0.9345723986625671, + "epoch": 0.7902439024390244, + "grad_norm": 0.47635290026664734, + "learning_rate": 6.734417344173443e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7155382037162781, + "num_tokens": 4445365.0, + "step": 324 + }, + { + "entropy": 0.931740939617157, + "epoch": 0.7926829268292683, + "grad_norm": 0.4577711224555969, + "learning_rate": 6.7208672086720876e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7215069532394409, + "num_tokens": 4459051.0, + "step": 325 + }, + { + "entropy": 0.9348794221878052, + "epoch": 0.7951219512195122, + "grad_norm": 0.4833238124847412, + "learning_rate": 6.707317073170733e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7192904353141785, + "num_tokens": 4472540.0, + "step": 326 + }, + { + "entropy": 0.9263391494750977, + "epoch": 0.7975609756097561, + "grad_norm": 0.5299957394599915, + "learning_rate": 6.693766937669377e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7203274965286255, + "num_tokens": 4486601.0, + "step": 327 + }, + { + "entropy": 0.9487115144729614, + "epoch": 0.8, + "grad_norm": 0.531969428062439, + "learning_rate": 6.680216802168022e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7118086218833923, + "num_tokens": 4500285.0, + "step": 328 + }, + { + "entropy": 0.9246724843978882, + "epoch": 0.802439024390244, + "grad_norm": 0.47073879837989807, + "learning_rate": 6.666666666666667e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7152372002601624, + "num_tokens": 4513814.0, + "step": 329 + }, + { + "entropy": 0.9244076013565063, + "epoch": 0.8048780487804879, + "grad_norm": 0.5922364592552185, + "learning_rate": 6.653116531165313e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.718658447265625, + "num_tokens": 4527486.0, + "step": 330 + }, + { + "entropy": 0.9245865345001221, + "epoch": 0.8073170731707318, + "grad_norm": 0.48464804887771606, + "learning_rate": 6.639566395663957e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.718628466129303, + "num_tokens": 4541384.0, + "step": 331 + }, + { + "entropy": 0.9379637241363525, + "epoch": 0.8097560975609757, + "grad_norm": 0.5395787954330444, + "learning_rate": 6.626016260162602e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7203213572502136, + "num_tokens": 4554844.0, + "step": 332 + }, + { + "entropy": 0.9466052055358887, + "epoch": 0.8121951219512196, + "grad_norm": 0.48719871044158936, + "learning_rate": 6.6124661246612474e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7128361463546753, + "num_tokens": 4568953.0, + "step": 333 + }, + { + "entropy": 0.9478181600570679, + "epoch": 0.8146341463414634, + "grad_norm": 0.4469890892505646, + "learning_rate": 6.598915989159892e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.716035783290863, + "num_tokens": 4582601.0, + "step": 334 + }, + { + "entropy": 0.946060299873352, + "epoch": 0.8170731707317073, + "grad_norm": 0.4337994456291199, + "learning_rate": 6.585365853658538e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7160581946372986, + "num_tokens": 4596641.0, + "step": 335 + }, + { + "entropy": 0.9306113719940186, + "epoch": 0.8195121951219512, + "grad_norm": 0.4518454074859619, + "learning_rate": 6.571815718157182e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.724989116191864, + "num_tokens": 4610431.0, + "step": 336 + }, + { + "entropy": 0.9236544966697693, + "epoch": 0.8219512195121951, + "grad_norm": 0.5101668238639832, + "learning_rate": 6.558265582655827e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7232162952423096, + "num_tokens": 4623692.0, + "step": 337 + }, + { + "entropy": 0.9380990266799927, + "epoch": 0.824390243902439, + "grad_norm": 0.43267154693603516, + "learning_rate": 6.544715447154472e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7211065888404846, + "num_tokens": 4637552.0, + "step": 338 + }, + { + "entropy": 0.9355394840240479, + "epoch": 0.8268292682926829, + "grad_norm": 0.476979523897171, + "learning_rate": 6.531165311653117e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7181625366210938, + "num_tokens": 4651718.0, + "step": 339 + }, + { + "entropy": 0.9340901374816895, + "epoch": 0.8292682926829268, + "grad_norm": 0.4357958734035492, + "learning_rate": 6.517615176151762e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7212997674942017, + "num_tokens": 4665706.0, + "step": 340 + }, + { + "entropy": 0.948611855506897, + "epoch": 0.8317073170731707, + "grad_norm": 0.49279242753982544, + "learning_rate": 6.504065040650407e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7165062427520752, + "num_tokens": 4679747.0, + "step": 341 + }, + { + "entropy": 0.9157364964485168, + "epoch": 0.8341463414634146, + "grad_norm": 0.4604078531265259, + "learning_rate": 6.4905149051490525e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7201681733131409, + "num_tokens": 4693557.0, + "step": 342 + }, + { + "entropy": 0.9359651207923889, + "epoch": 0.8365853658536585, + "grad_norm": 0.5489866137504578, + "learning_rate": 6.476964769647697e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7123420238494873, + "num_tokens": 4707023.0, + "step": 343 + }, + { + "entropy": 0.9256577491760254, + "epoch": 0.8390243902439024, + "grad_norm": 0.47433802485466003, + "learning_rate": 6.463414634146342e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.716153085231781, + "num_tokens": 4720733.0, + "step": 344 + }, + { + "entropy": 0.9283899068832397, + "epoch": 0.8414634146341463, + "grad_norm": 0.42945265769958496, + "learning_rate": 6.449864498644986e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7218199372291565, + "num_tokens": 4734420.0, + "step": 345 + }, + { + "entropy": 0.9272925853729248, + "epoch": 0.8439024390243902, + "grad_norm": 0.5199980139732361, + "learning_rate": 6.436314363143632e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7235289812088013, + "num_tokens": 4748083.0, + "step": 346 + }, + { + "entropy": 0.9043445587158203, + "epoch": 0.8463414634146341, + "grad_norm": 0.535645604133606, + "learning_rate": 6.422764227642278e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7166420221328735, + "num_tokens": 4761619.0, + "step": 347 + }, + { + "entropy": 0.9312768578529358, + "epoch": 0.848780487804878, + "grad_norm": 0.480049729347229, + "learning_rate": 6.409214092140922e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.722160816192627, + "num_tokens": 4775204.0, + "step": 348 + }, + { + "entropy": 0.9094958305358887, + "epoch": 0.8512195121951219, + "grad_norm": 0.5510603785514832, + "learning_rate": 6.395663956639567e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7258016467094421, + "num_tokens": 4788630.0, + "step": 349 + }, + { + "entropy": 0.9357163906097412, + "epoch": 0.8536585365853658, + "grad_norm": 0.4671006500720978, + "learning_rate": 6.3821138211382115e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7199651002883911, + "num_tokens": 4802405.0, + "step": 350 + }, + { + "entropy": 0.911231517791748, + "epoch": 0.8560975609756097, + "grad_norm": 0.531291127204895, + "learning_rate": 6.368563685636857e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7275990843772888, + "num_tokens": 4816070.0, + "step": 351 + }, + { + "entropy": 0.9376004934310913, + "epoch": 0.8585365853658536, + "grad_norm": 0.5408096313476562, + "learning_rate": 6.355013550135501e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7203587293624878, + "num_tokens": 4829800.0, + "step": 352 + }, + { + "entropy": 0.9320589303970337, + "epoch": 0.8609756097560975, + "grad_norm": 0.4515257477760315, + "learning_rate": 6.341463414634147e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.719613790512085, + "num_tokens": 4843383.0, + "step": 353 + }, + { + "entropy": 0.9337896108627319, + "epoch": 0.8634146341463415, + "grad_norm": 0.5287736058235168, + "learning_rate": 6.3279132791327915e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.716210126876831, + "num_tokens": 4856761.0, + "step": 354 + }, + { + "entropy": 0.9203476309776306, + "epoch": 0.8658536585365854, + "grad_norm": 0.5867226719856262, + "learning_rate": 6.314363143631437e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.724420964717865, + "num_tokens": 4870421.0, + "step": 355 + }, + { + "entropy": 0.943367063999176, + "epoch": 0.8682926829268293, + "grad_norm": 0.4877321124076843, + "learning_rate": 6.300813008130082e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7234193086624146, + "num_tokens": 4884545.0, + "step": 356 + }, + { + "entropy": 0.9346873760223389, + "epoch": 0.8707317073170732, + "grad_norm": 0.465480238199234, + "learning_rate": 6.287262872628726e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7138851881027222, + "num_tokens": 4898115.0, + "step": 357 + }, + { + "entropy": 0.9186785221099854, + "epoch": 0.8731707317073171, + "grad_norm": 0.560099184513092, + "learning_rate": 6.273712737127372e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7197883129119873, + "num_tokens": 4911735.0, + "step": 358 + }, + { + "entropy": 0.9136354923248291, + "epoch": 0.875609756097561, + "grad_norm": 0.5416183471679688, + "learning_rate": 6.260162601626017e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7166678309440613, + "num_tokens": 4925964.0, + "step": 359 + }, + { + "entropy": 0.8963789343833923, + "epoch": 0.8780487804878049, + "grad_norm": 0.5411756038665771, + "learning_rate": 6.246612466124662e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.726434051990509, + "num_tokens": 4939421.0, + "step": 360 + }, + { + "entropy": 0.9152790307998657, + "epoch": 0.8804878048780488, + "grad_norm": 0.45827537775039673, + "learning_rate": 6.233062330623306e-06, + "loss": 0.924, + "mean_token_accuracy": 0.719045877456665, + "num_tokens": 4953272.0, + "step": 361 + }, + { + "entropy": 0.9214059114456177, + "epoch": 0.8829268292682927, + "grad_norm": 0.5113202333450317, + "learning_rate": 6.219512195121951e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7190465331077576, + "num_tokens": 4966671.0, + "step": 362 + }, + { + "entropy": 0.9017857313156128, + "epoch": 0.8853658536585366, + "grad_norm": 0.49038177728652954, + "learning_rate": 6.205962059620597e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7244783639907837, + "num_tokens": 4980058.0, + "step": 363 + }, + { + "entropy": 0.9247295260429382, + "epoch": 0.8878048780487805, + "grad_norm": 0.4466925859451294, + "learning_rate": 6.192411924119242e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7192449569702148, + "num_tokens": 4993901.0, + "step": 364 + }, + { + "entropy": 0.9221506118774414, + "epoch": 0.8902439024390244, + "grad_norm": 0.4737914502620697, + "learning_rate": 6.178861788617887e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7163258194923401, + "num_tokens": 5007852.0, + "step": 365 + }, + { + "entropy": 0.9240057468414307, + "epoch": 0.8926829268292683, + "grad_norm": 0.5152718424797058, + "learning_rate": 6.165311653116531e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7177215218544006, + "num_tokens": 5021298.0, + "step": 366 + }, + { + "entropy": 0.9130709171295166, + "epoch": 0.8951219512195122, + "grad_norm": 0.5038972496986389, + "learning_rate": 6.1517615176151765e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.726975679397583, + "num_tokens": 5034474.0, + "step": 367 + }, + { + "entropy": 0.9465330839157104, + "epoch": 0.8975609756097561, + "grad_norm": 0.48436981439590454, + "learning_rate": 6.138211382113821e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7148197889328003, + "num_tokens": 5047864.0, + "step": 368 + }, + { + "entropy": 0.9248801469802856, + "epoch": 0.9, + "grad_norm": 0.523639440536499, + "learning_rate": 6.124661246612467e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7153012156486511, + "num_tokens": 5061807.0, + "step": 369 + }, + { + "entropy": 0.9198777675628662, + "epoch": 0.9024390243902439, + "grad_norm": 0.4940233528614044, + "learning_rate": 6.111111111111112e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.73076331615448, + "num_tokens": 5074871.0, + "step": 370 + }, + { + "entropy": 0.9299763441085815, + "epoch": 0.9048780487804878, + "grad_norm": 0.5024548768997192, + "learning_rate": 6.0975609756097564e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7201763987541199, + "num_tokens": 5088492.0, + "step": 371 + }, + { + "entropy": 0.9322130680084229, + "epoch": 0.9073170731707317, + "grad_norm": 0.4858999252319336, + "learning_rate": 6.084010840108402e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7197042107582092, + "num_tokens": 5102165.0, + "step": 372 + }, + { + "entropy": 0.9305065870285034, + "epoch": 0.9097560975609756, + "grad_norm": 0.5411167144775391, + "learning_rate": 6.070460704607046e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7243618369102478, + "num_tokens": 5115735.0, + "step": 373 + }, + { + "entropy": 0.9265807867050171, + "epoch": 0.9121951219512195, + "grad_norm": 0.5147826075553894, + "learning_rate": 6.056910569105692e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7187079191207886, + "num_tokens": 5129125.0, + "step": 374 + }, + { + "entropy": 0.9361748695373535, + "epoch": 0.9146341463414634, + "grad_norm": 0.4886268079280853, + "learning_rate": 6.043360433604336e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7172378897666931, + "num_tokens": 5142884.0, + "step": 375 + }, + { + "entropy": 0.9086503386497498, + "epoch": 0.9170731707317074, + "grad_norm": 0.5013159513473511, + "learning_rate": 6.0298102981029816e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7264388203620911, + "num_tokens": 5156418.0, + "step": 376 + }, + { + "entropy": 0.8983087539672852, + "epoch": 0.9195121951219513, + "grad_norm": 0.5629090070724487, + "learning_rate": 6.016260162601627e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7258298397064209, + "num_tokens": 5170232.0, + "step": 377 + }, + { + "entropy": 0.9251675605773926, + "epoch": 0.9219512195121952, + "grad_norm": 0.6035507321357727, + "learning_rate": 6.002710027100271e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7216352224349976, + "num_tokens": 5183971.0, + "step": 378 + }, + { + "entropy": 0.9210293292999268, + "epoch": 0.9243902439024391, + "grad_norm": 0.46510669589042664, + "learning_rate": 5.989159891598917e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7173804044723511, + "num_tokens": 5197928.0, + "step": 379 + }, + { + "entropy": 0.9165781140327454, + "epoch": 0.926829268292683, + "grad_norm": 0.49598658084869385, + "learning_rate": 5.9756097560975615e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7150620818138123, + "num_tokens": 5211561.0, + "step": 380 + }, + { + "entropy": 0.9051110744476318, + "epoch": 0.9292682926829269, + "grad_norm": 0.5174202919006348, + "learning_rate": 5.962059620596207e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7223758697509766, + "num_tokens": 5225315.0, + "step": 381 + }, + { + "entropy": 0.8991729021072388, + "epoch": 0.9317073170731708, + "grad_norm": 0.5201939344406128, + "learning_rate": 5.948509485094851e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7234763503074646, + "num_tokens": 5239196.0, + "step": 382 + }, + { + "entropy": 0.9231674671173096, + "epoch": 0.9341463414634147, + "grad_norm": 0.6580784916877747, + "learning_rate": 5.934959349593496e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.716602087020874, + "num_tokens": 5253150.0, + "step": 383 + }, + { + "entropy": 0.9015557169914246, + "epoch": 0.9365853658536586, + "grad_norm": 0.6364657878875732, + "learning_rate": 5.921409214092141e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7272793650627136, + "num_tokens": 5266887.0, + "step": 384 + }, + { + "entropy": 0.9314663410186768, + "epoch": 0.9390243902439024, + "grad_norm": 0.4809994399547577, + "learning_rate": 5.907859078590787e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7218727469444275, + "num_tokens": 5280893.0, + "step": 385 + }, + { + "entropy": 0.9162091612815857, + "epoch": 0.9414634146341463, + "grad_norm": 0.4761136472225189, + "learning_rate": 5.894308943089432e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7201241850852966, + "num_tokens": 5294758.0, + "step": 386 + }, + { + "entropy": 0.937978982925415, + "epoch": 0.9439024390243902, + "grad_norm": 0.5258710384368896, + "learning_rate": 5.880758807588076e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7101801633834839, + "num_tokens": 5308762.0, + "step": 387 + }, + { + "entropy": 0.9138582348823547, + "epoch": 0.9463414634146341, + "grad_norm": 0.5211506485939026, + "learning_rate": 5.867208672086721e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7287028431892395, + "num_tokens": 5323017.0, + "step": 388 + }, + { + "entropy": 0.9176620244979858, + "epoch": 0.948780487804878, + "grad_norm": 0.4472411572933197, + "learning_rate": 5.853658536585366e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7202617526054382, + "num_tokens": 5336635.0, + "step": 389 + }, + { + "entropy": 0.9260638952255249, + "epoch": 0.9512195121951219, + "grad_norm": 0.5161044597625732, + "learning_rate": 5.840108401084012e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7174168825149536, + "num_tokens": 5350247.0, + "step": 390 + }, + { + "entropy": 0.9278532862663269, + "epoch": 0.9536585365853658, + "grad_norm": 0.462639719247818, + "learning_rate": 5.826558265582656e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7207022905349731, + "num_tokens": 5364445.0, + "step": 391 + }, + { + "entropy": 0.9160816669464111, + "epoch": 0.9560975609756097, + "grad_norm": 0.4721212387084961, + "learning_rate": 5.813008130081301e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7259811758995056, + "num_tokens": 5378398.0, + "step": 392 + }, + { + "entropy": 0.9266201257705688, + "epoch": 0.9585365853658536, + "grad_norm": 0.5438399314880371, + "learning_rate": 5.7994579945799465e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7273541688919067, + "num_tokens": 5391805.0, + "step": 393 + }, + { + "entropy": 0.925537645816803, + "epoch": 0.9609756097560975, + "grad_norm": 0.47833213210105896, + "learning_rate": 5.785907859078591e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7158151268959045, + "num_tokens": 5405365.0, + "step": 394 + }, + { + "entropy": 0.921244204044342, + "epoch": 0.9634146341463414, + "grad_norm": 0.5551829934120178, + "learning_rate": 5.772357723577237e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7213079333305359, + "num_tokens": 5419174.0, + "step": 395 + }, + { + "entropy": 0.924101710319519, + "epoch": 0.9658536585365853, + "grad_norm": 0.4825878441333771, + "learning_rate": 5.758807588075881e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7207282185554504, + "num_tokens": 5432428.0, + "step": 396 + }, + { + "entropy": 0.9335201978683472, + "epoch": 0.9682926829268292, + "grad_norm": 0.46904629468917847, + "learning_rate": 5.7452574525745265e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7109548449516296, + "num_tokens": 5445954.0, + "step": 397 + }, + { + "entropy": 0.9025465250015259, + "epoch": 0.9707317073170731, + "grad_norm": 0.5336489081382751, + "learning_rate": 5.731707317073171e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7340313196182251, + "num_tokens": 5459246.0, + "step": 398 + }, + { + "entropy": 0.921517014503479, + "epoch": 0.973170731707317, + "grad_norm": 0.4557330012321472, + "learning_rate": 5.718157181571816e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7241076827049255, + "num_tokens": 5472934.0, + "step": 399 + }, + { + "entropy": 0.9142166972160339, + "epoch": 0.975609756097561, + "grad_norm": 0.48192092776298523, + "learning_rate": 5.704607046070462e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7226113677024841, + "num_tokens": 5486797.0, + "step": 400 + }, + { + "entropy": 0.9260218143463135, + "epoch": 0.9780487804878049, + "grad_norm": 0.44798484444618225, + "learning_rate": 5.691056910569106e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7207956314086914, + "num_tokens": 5500638.0, + "step": 401 + }, + { + "entropy": 0.9152418375015259, + "epoch": 0.9804878048780488, + "grad_norm": 0.43449974060058594, + "learning_rate": 5.677506775067752e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7272465229034424, + "num_tokens": 5514542.0, + "step": 402 + }, + { + "entropy": 0.9501792788505554, + "epoch": 0.9829268292682927, + "grad_norm": 0.47718504071235657, + "learning_rate": 5.663956639566396e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7121170163154602, + "num_tokens": 5528918.0, + "step": 403 + }, + { + "entropy": 0.9009894728660583, + "epoch": 0.9853658536585366, + "grad_norm": 0.5538046360015869, + "learning_rate": 5.650406504065041e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7224763631820679, + "num_tokens": 5542486.0, + "step": 404 + }, + { + "entropy": 0.8964865207672119, + "epoch": 0.9878048780487805, + "grad_norm": 0.47334983944892883, + "learning_rate": 5.6368563685636855e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7234845757484436, + "num_tokens": 5556541.0, + "step": 405 + }, + { + "entropy": 0.9112534523010254, + "epoch": 0.9902439024390244, + "grad_norm": 0.5056594610214233, + "learning_rate": 5.6233062330623315e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7229964137077332, + "num_tokens": 5570120.0, + "step": 406 + }, + { + "entropy": 0.9020826816558838, + "epoch": 0.9926829268292683, + "grad_norm": 0.45458662509918213, + "learning_rate": 5.609756097560977e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7202485203742981, + "num_tokens": 5583816.0, + "step": 407 + }, + { + "entropy": 0.9063427448272705, + "epoch": 0.9951219512195122, + "grad_norm": 0.44870609045028687, + "learning_rate": 5.596205962059621e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7196942567825317, + "num_tokens": 5597699.0, + "step": 408 + }, + { + "entropy": 0.9229121208190918, + "epoch": 0.9975609756097561, + "grad_norm": 0.47767025232315063, + "learning_rate": 5.582655826558266e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7199122905731201, + "num_tokens": 5611857.0, + "step": 409 + }, + { + "entropy": 0.9124419093132019, + "epoch": 1.0, + "grad_norm": 0.5221594572067261, + "learning_rate": 5.569105691056911e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7252015471458435, + "num_tokens": 5625392.0, + "step": 410 + }, + { + "entropy": 0.9091067314147949, + "epoch": 1.002439024390244, + "grad_norm": 0.46902021765708923, + "learning_rate": 5.555555555555557e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7198294401168823, + "num_tokens": 5639478.0, + "step": 411 + }, + { + "entropy": 0.8968576192855835, + "epoch": 1.0048780487804878, + "grad_norm": 0.4639955759048462, + "learning_rate": 5.542005420054201e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7282954454421997, + "num_tokens": 5653005.0, + "step": 412 + }, + { + "entropy": 0.9134030342102051, + "epoch": 1.0073170731707317, + "grad_norm": 0.539460301399231, + "learning_rate": 5.528455284552846e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7240451574325562, + "num_tokens": 5666400.0, + "step": 413 + }, + { + "entropy": 0.9106577634811401, + "epoch": 1.0097560975609756, + "grad_norm": 0.5035801529884338, + "learning_rate": 5.5149051490514906e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7225700616836548, + "num_tokens": 5679832.0, + "step": 414 + }, + { + "entropy": 0.9107507467269897, + "epoch": 1.0121951219512195, + "grad_norm": 0.48390769958496094, + "learning_rate": 5.501355013550136e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7192770838737488, + "num_tokens": 5693958.0, + "step": 415 + }, + { + "entropy": 0.9108355045318604, + "epoch": 1.0146341463414634, + "grad_norm": 0.5258286595344543, + "learning_rate": 5.487804878048781e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7195926308631897, + "num_tokens": 5707622.0, + "step": 416 + }, + { + "entropy": 0.9191775918006897, + "epoch": 1.0170731707317073, + "grad_norm": 0.5931156277656555, + "learning_rate": 5.474254742547425e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7233299612998962, + "num_tokens": 5721380.0, + "step": 417 + }, + { + "entropy": 0.9231445789337158, + "epoch": 1.0195121951219512, + "grad_norm": 0.5037340521812439, + "learning_rate": 5.460704607046071e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7243925929069519, + "num_tokens": 5734937.0, + "step": 418 + }, + { + "entropy": 0.9194551110267639, + "epoch": 1.0219512195121951, + "grad_norm": 0.4648572504520416, + "learning_rate": 5.447154471544716e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.721906304359436, + "num_tokens": 5748319.0, + "step": 419 + }, + { + "entropy": 0.9239892959594727, + "epoch": 1.024390243902439, + "grad_norm": 0.49774280190467834, + "learning_rate": 5.433604336043361e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7180191278457642, + "num_tokens": 5761804.0, + "step": 420 + }, + { + "entropy": 0.9286030530929565, + "epoch": 1.026829268292683, + "grad_norm": 0.526913046836853, + "learning_rate": 5.420054200542005e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7196871638298035, + "num_tokens": 5775758.0, + "step": 421 + }, + { + "entropy": 0.9007755517959595, + "epoch": 1.0292682926829269, + "grad_norm": 0.4655540883541107, + "learning_rate": 5.4065040650406504e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7245073914527893, + "num_tokens": 5789680.0, + "step": 422 + }, + { + "entropy": 0.9167541265487671, + "epoch": 1.0317073170731708, + "grad_norm": 0.49581092596054077, + "learning_rate": 5.3929539295392965e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7206687927246094, + "num_tokens": 5803751.0, + "step": 423 + }, + { + "entropy": 0.9178483486175537, + "epoch": 1.0341463414634147, + "grad_norm": 0.5530057549476624, + "learning_rate": 5.379403794037941e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7275153994560242, + "num_tokens": 5817254.0, + "step": 424 + }, + { + "entropy": 0.9028128385543823, + "epoch": 1.0365853658536586, + "grad_norm": 0.5000537633895874, + "learning_rate": 5.365853658536586e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7256110906600952, + "num_tokens": 5830893.0, + "step": 425 + }, + { + "entropy": 0.9217618107795715, + "epoch": 1.0390243902439025, + "grad_norm": 0.49565988779067993, + "learning_rate": 5.35230352303523e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7196056842803955, + "num_tokens": 5844604.0, + "step": 426 + }, + { + "entropy": 0.9046823978424072, + "epoch": 1.0414634146341464, + "grad_norm": 0.47147202491760254, + "learning_rate": 5.338753387533876e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.724524974822998, + "num_tokens": 5858251.0, + "step": 427 + }, + { + "entropy": 0.8891482353210449, + "epoch": 1.0439024390243903, + "grad_norm": 0.5021435618400574, + "learning_rate": 5.32520325203252e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7333284020423889, + "num_tokens": 5871853.0, + "step": 428 + }, + { + "entropy": 0.9102505445480347, + "epoch": 1.0463414634146342, + "grad_norm": 0.7069030404090881, + "learning_rate": 5.311653116531166e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7191069722175598, + "num_tokens": 5885351.0, + "step": 429 + }, + { + "entropy": 0.8940747976303101, + "epoch": 1.048780487804878, + "grad_norm": 0.4877447783946991, + "learning_rate": 5.298102981029811e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7237207293510437, + "num_tokens": 5899418.0, + "step": 430 + }, + { + "entropy": 0.8975793123245239, + "epoch": 1.051219512195122, + "grad_norm": 0.49886640906333923, + "learning_rate": 5.2845528455284555e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7229393720626831, + "num_tokens": 5913301.0, + "step": 431 + }, + { + "entropy": 0.9058709144592285, + "epoch": 1.053658536585366, + "grad_norm": 0.5046048760414124, + "learning_rate": 5.271002710027101e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.727680504322052, + "num_tokens": 5927139.0, + "step": 432 + }, + { + "entropy": 0.8988399505615234, + "epoch": 1.0560975609756098, + "grad_norm": 0.5127994418144226, + "learning_rate": 5.257452574525745e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7257552742958069, + "num_tokens": 5940395.0, + "step": 433 + }, + { + "entropy": 0.9131532907485962, + "epoch": 1.0585365853658537, + "grad_norm": 0.5204398036003113, + "learning_rate": 5.243902439024391e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7254641652107239, + "num_tokens": 5953983.0, + "step": 434 + }, + { + "entropy": 0.9116688966751099, + "epoch": 1.0609756097560976, + "grad_norm": 0.5195981860160828, + "learning_rate": 5.2303523035230355e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7243059873580933, + "num_tokens": 5967543.0, + "step": 435 + }, + { + "entropy": 0.8945907354354858, + "epoch": 1.0634146341463415, + "grad_norm": 0.4468877613544464, + "learning_rate": 5.216802168021681e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7255864143371582, + "num_tokens": 5981414.0, + "step": 436 + }, + { + "entropy": 0.914824366569519, + "epoch": 1.0658536585365854, + "grad_norm": 0.4595797657966614, + "learning_rate": 5.203252032520326e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7230360507965088, + "num_tokens": 5995356.0, + "step": 437 + }, + { + "entropy": 0.9173953533172607, + "epoch": 1.0682926829268293, + "grad_norm": 0.5476388931274414, + "learning_rate": 5.18970189701897e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7220655083656311, + "num_tokens": 6009199.0, + "step": 438 + }, + { + "entropy": 0.9245563745498657, + "epoch": 1.0707317073170732, + "grad_norm": 0.5139727592468262, + "learning_rate": 5.176151761517616e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7199491262435913, + "num_tokens": 6023366.0, + "step": 439 + }, + { + "entropy": 0.8988422751426697, + "epoch": 1.0731707317073171, + "grad_norm": 0.5930190682411194, + "learning_rate": 5.162601626016261e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7224959135055542, + "num_tokens": 6036780.0, + "step": 440 + }, + { + "entropy": 0.9217547178268433, + "epoch": 1.075609756097561, + "grad_norm": 0.534253716468811, + "learning_rate": 5.149051490514906e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7249597907066345, + "num_tokens": 6050474.0, + "step": 441 + }, + { + "entropy": 0.9051541090011597, + "epoch": 1.078048780487805, + "grad_norm": 0.5235859751701355, + "learning_rate": 5.13550135501355e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7299248576164246, + "num_tokens": 6063927.0, + "step": 442 + }, + { + "entropy": 0.9091707468032837, + "epoch": 1.0804878048780489, + "grad_norm": 0.5509324669837952, + "learning_rate": 5.121951219512195e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7241198420524597, + "num_tokens": 6077293.0, + "step": 443 + }, + { + "entropy": 0.9139906167984009, + "epoch": 1.0829268292682928, + "grad_norm": 0.4984675943851471, + "learning_rate": 5.10840108401084e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7256140112876892, + "num_tokens": 6091559.0, + "step": 444 + }, + { + "entropy": 0.8962512016296387, + "epoch": 1.0853658536585367, + "grad_norm": 0.6049586534500122, + "learning_rate": 5.094850948509486e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7295030951499939, + "num_tokens": 6106004.0, + "step": 445 + }, + { + "entropy": 0.9080502986907959, + "epoch": 1.0878048780487806, + "grad_norm": 0.539002001285553, + "learning_rate": 5.081300813008131e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7214069366455078, + "num_tokens": 6119240.0, + "step": 446 + }, + { + "entropy": 0.9243647456169128, + "epoch": 1.0902439024390245, + "grad_norm": 0.5728839635848999, + "learning_rate": 5.067750677506775e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7210649251937866, + "num_tokens": 6133417.0, + "step": 447 + }, + { + "entropy": 0.9063730835914612, + "epoch": 1.0926829268292684, + "grad_norm": 0.5486031174659729, + "learning_rate": 5.0542005420054205e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7223870158195496, + "num_tokens": 6147593.0, + "step": 448 + }, + { + "entropy": 0.9127067923545837, + "epoch": 1.0951219512195123, + "grad_norm": 0.5353381037712097, + "learning_rate": 5.040650406504065e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7239224314689636, + "num_tokens": 6161529.0, + "step": 449 + }, + { + "entropy": 0.900060772895813, + "epoch": 1.0975609756097562, + "grad_norm": 0.49772390723228455, + "learning_rate": 5.027100271002711e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7243174910545349, + "num_tokens": 6175757.0, + "step": 450 + }, + { + "entropy": 0.9074562788009644, + "epoch": 1.1, + "grad_norm": 0.47336098551750183, + "learning_rate": 5.013550135501355e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7258216142654419, + "num_tokens": 6189618.0, + "step": 451 + }, + { + "entropy": 0.9215185642242432, + "epoch": 1.102439024390244, + "grad_norm": 0.49442532658576965, + "learning_rate": 5e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7188286781311035, + "num_tokens": 6203533.0, + "step": 452 + }, + { + "entropy": 0.9018423557281494, + "epoch": 1.104878048780488, + "grad_norm": 0.5464769601821899, + "learning_rate": 4.986449864498646e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7272529602050781, + "num_tokens": 6217342.0, + "step": 453 + }, + { + "entropy": 0.9130271673202515, + "epoch": 1.1073170731707318, + "grad_norm": 0.4416399896144867, + "learning_rate": 4.97289972899729e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7259694933891296, + "num_tokens": 6231309.0, + "step": 454 + }, + { + "entropy": 0.8990261554718018, + "epoch": 1.1097560975609757, + "grad_norm": 0.4484340250492096, + "learning_rate": 4.959349593495935e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7287976741790771, + "num_tokens": 6245215.0, + "step": 455 + }, + { + "entropy": 0.8985174298286438, + "epoch": 1.1121951219512196, + "grad_norm": 0.47807013988494873, + "learning_rate": 4.94579945799458e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7291353344917297, + "num_tokens": 6259190.0, + "step": 456 + }, + { + "entropy": 0.8863782286643982, + "epoch": 1.1146341463414635, + "grad_norm": 0.44421708583831787, + "learning_rate": 4.9322493224932255e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7294890880584717, + "num_tokens": 6273357.0, + "step": 457 + }, + { + "entropy": 0.8936686515808105, + "epoch": 1.1170731707317074, + "grad_norm": 0.5020765066146851, + "learning_rate": 4.918699186991871e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7257601618766785, + "num_tokens": 6286956.0, + "step": 458 + }, + { + "entropy": 0.8936822414398193, + "epoch": 1.1195121951219513, + "grad_norm": 0.4957248568534851, + "learning_rate": 4.905149051490515e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7259597778320312, + "num_tokens": 6300751.0, + "step": 459 + }, + { + "entropy": 0.8922320604324341, + "epoch": 1.1219512195121952, + "grad_norm": 0.5009384155273438, + "learning_rate": 4.89159891598916e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7239802479743958, + "num_tokens": 6314545.0, + "step": 460 + }, + { + "entropy": 0.9002817869186401, + "epoch": 1.1243902439024391, + "grad_norm": 0.5000170469284058, + "learning_rate": 4.8780487804878055e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7265538573265076, + "num_tokens": 6328092.0, + "step": 461 + }, + { + "entropy": 0.8917896747589111, + "epoch": 1.126829268292683, + "grad_norm": 0.4810425639152527, + "learning_rate": 4.86449864498645e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7309262752532959, + "num_tokens": 6342067.0, + "step": 462 + }, + { + "entropy": 0.9265782833099365, + "epoch": 1.129268292682927, + "grad_norm": 0.474269300699234, + "learning_rate": 4.850948509485095e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7133171558380127, + "num_tokens": 6356095.0, + "step": 463 + }, + { + "entropy": 0.9106369018554688, + "epoch": 1.1317073170731708, + "grad_norm": 0.49770355224609375, + "learning_rate": 4.83739837398374e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7211055159568787, + "num_tokens": 6369643.0, + "step": 464 + }, + { + "entropy": 0.8969599008560181, + "epoch": 1.1341463414634148, + "grad_norm": 0.47881314158439636, + "learning_rate": 4.823848238482385e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7260048985481262, + "num_tokens": 6383068.0, + "step": 465 + }, + { + "entropy": 0.9027596712112427, + "epoch": 1.1365853658536587, + "grad_norm": 0.5635483264923096, + "learning_rate": 4.810298102981031e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7189622521400452, + "num_tokens": 6396922.0, + "step": 466 + }, + { + "entropy": 0.8914520740509033, + "epoch": 1.1390243902439026, + "grad_norm": 0.5290432572364807, + "learning_rate": 4.796747967479675e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7268030047416687, + "num_tokens": 6410873.0, + "step": 467 + }, + { + "entropy": 0.8867425918579102, + "epoch": 1.1414634146341462, + "grad_norm": 0.48660776019096375, + "learning_rate": 4.78319783197832e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.729336678981781, + "num_tokens": 6424367.0, + "step": 468 + }, + { + "entropy": 0.9146620035171509, + "epoch": 1.1439024390243901, + "grad_norm": 0.5063614249229431, + "learning_rate": 4.769647696476965e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.724082350730896, + "num_tokens": 6438032.0, + "step": 469 + }, + { + "entropy": 0.9144189357757568, + "epoch": 1.146341463414634, + "grad_norm": 0.5035169124603271, + "learning_rate": 4.75609756097561e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7198021411895752, + "num_tokens": 6451592.0, + "step": 470 + }, + { + "entropy": 0.9157655835151672, + "epoch": 1.148780487804878, + "grad_norm": 0.5481422543525696, + "learning_rate": 4.742547425474256e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7282554507255554, + "num_tokens": 6465577.0, + "step": 471 + }, + { + "entropy": 0.8913492560386658, + "epoch": 1.1512195121951219, + "grad_norm": 0.5407872796058655, + "learning_rate": 4.7289972899729e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7290219068527222, + "num_tokens": 6478690.0, + "step": 472 + }, + { + "entropy": 0.9125957489013672, + "epoch": 1.1536585365853658, + "grad_norm": 0.4850672483444214, + "learning_rate": 4.715447154471545e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7230581641197205, + "num_tokens": 6492662.0, + "step": 473 + }, + { + "entropy": 0.9092464447021484, + "epoch": 1.1560975609756097, + "grad_norm": 0.4803924560546875, + "learning_rate": 4.7018970189701905e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7263503670692444, + "num_tokens": 6506082.0, + "step": 474 + }, + { + "entropy": 0.9019302129745483, + "epoch": 1.1585365853658536, + "grad_norm": 0.4694143533706665, + "learning_rate": 4.688346883468835e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7246914505958557, + "num_tokens": 6520115.0, + "step": 475 + }, + { + "entropy": 0.9142214059829712, + "epoch": 1.1609756097560975, + "grad_norm": 0.5448134541511536, + "learning_rate": 4.67479674796748e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7227379679679871, + "num_tokens": 6533703.0, + "step": 476 + }, + { + "entropy": 0.8932230472564697, + "epoch": 1.1634146341463414, + "grad_norm": 0.7330875992774963, + "learning_rate": 4.661246612466125e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7294765114784241, + "num_tokens": 6547167.0, + "step": 477 + }, + { + "entropy": 0.8862720727920532, + "epoch": 1.1658536585365853, + "grad_norm": 0.5509917736053467, + "learning_rate": 4.64769647696477e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7343223094940186, + "num_tokens": 6560530.0, + "step": 478 + }, + { + "entropy": 0.8873111009597778, + "epoch": 1.1682926829268292, + "grad_norm": 0.4889533817768097, + "learning_rate": 4.634146341463416e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.731568455696106, + "num_tokens": 6574069.0, + "step": 479 + }, + { + "entropy": 0.8986578583717346, + "epoch": 1.170731707317073, + "grad_norm": 0.4992435872554779, + "learning_rate": 4.62059620596206e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.726729154586792, + "num_tokens": 6587632.0, + "step": 480 + }, + { + "entropy": 0.8890163898468018, + "epoch": 1.173170731707317, + "grad_norm": 0.5475744009017944, + "learning_rate": 4.607046070460705e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7279131412506104, + "num_tokens": 6601276.0, + "step": 481 + }, + { + "entropy": 0.8914072513580322, + "epoch": 1.175609756097561, + "grad_norm": 0.49075618386268616, + "learning_rate": 4.59349593495935e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7293311357498169, + "num_tokens": 6614718.0, + "step": 482 + }, + { + "entropy": 0.9068721532821655, + "epoch": 1.1780487804878048, + "grad_norm": 0.5013757348060608, + "learning_rate": 4.579945799457995e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7248654365539551, + "num_tokens": 6628480.0, + "step": 483 + }, + { + "entropy": 0.8967253565788269, + "epoch": 1.1804878048780487, + "grad_norm": 0.450345516204834, + "learning_rate": 4.56639566395664e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7259194850921631, + "num_tokens": 6642335.0, + "step": 484 + }, + { + "entropy": 0.9039151668548584, + "epoch": 1.1829268292682926, + "grad_norm": 0.5018711090087891, + "learning_rate": 4.552845528455285e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7184143662452698, + "num_tokens": 6656503.0, + "step": 485 + }, + { + "entropy": 0.8783119916915894, + "epoch": 1.1853658536585365, + "grad_norm": 0.4550807476043701, + "learning_rate": 4.53929539295393e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7290191650390625, + "num_tokens": 6670782.0, + "step": 486 + }, + { + "entropy": 0.8858023881912231, + "epoch": 1.1878048780487804, + "grad_norm": 0.5283997058868408, + "learning_rate": 4.5257452574525755e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7234495282173157, + "num_tokens": 6684423.0, + "step": 487 + }, + { + "entropy": 0.9109866619110107, + "epoch": 1.1902439024390243, + "grad_norm": 0.4794978201389313, + "learning_rate": 4.51219512195122e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7165172100067139, + "num_tokens": 6698267.0, + "step": 488 + }, + { + "entropy": 0.9090756177902222, + "epoch": 1.1926829268292682, + "grad_norm": 0.5581902861595154, + "learning_rate": 4.498644986449865e-06, + "loss": 0.889, + "mean_token_accuracy": 0.728282630443573, + "num_tokens": 6711786.0, + "step": 489 + }, + { + "entropy": 0.8913673758506775, + "epoch": 1.1951219512195121, + "grad_norm": 0.5424763560295105, + "learning_rate": 4.485094850948509e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7309855818748474, + "num_tokens": 6725489.0, + "step": 490 + }, + { + "entropy": 0.9181253910064697, + "epoch": 1.197560975609756, + "grad_norm": 0.4987359941005707, + "learning_rate": 4.471544715447155e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7176190614700317, + "num_tokens": 6738962.0, + "step": 491 + }, + { + "entropy": 0.8765039443969727, + "epoch": 1.2, + "grad_norm": 0.4425133764743805, + "learning_rate": 4.4579945799458e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.732286810874939, + "num_tokens": 6752993.0, + "step": 492 + }, + { + "entropy": 0.9064300060272217, + "epoch": 1.2024390243902439, + "grad_norm": 0.5201969146728516, + "learning_rate": 4.444444444444444e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7229897379875183, + "num_tokens": 6767124.0, + "step": 493 + }, + { + "entropy": 0.9056813716888428, + "epoch": 1.2048780487804878, + "grad_norm": 0.6040856242179871, + "learning_rate": 4.43089430894309e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7237012386322021, + "num_tokens": 6781038.0, + "step": 494 + }, + { + "entropy": 0.8948585987091064, + "epoch": 1.2073170731707317, + "grad_norm": 0.4676818251609802, + "learning_rate": 4.4173441734417345e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7293363213539124, + "num_tokens": 6794931.0, + "step": 495 + }, + { + "entropy": 0.9097365736961365, + "epoch": 1.2097560975609756, + "grad_norm": 0.506108820438385, + "learning_rate": 4.40379403794038e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7242258191108704, + "num_tokens": 6808284.0, + "step": 496 + }, + { + "entropy": 0.8956904411315918, + "epoch": 1.2121951219512195, + "grad_norm": 0.4974040389060974, + "learning_rate": 4.390243902439025e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7297356724739075, + "num_tokens": 6821994.0, + "step": 497 + }, + { + "entropy": 0.899440348148346, + "epoch": 1.2146341463414634, + "grad_norm": 0.5107272267341614, + "learning_rate": 4.376693766937669e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7293535470962524, + "num_tokens": 6835947.0, + "step": 498 + }, + { + "entropy": 0.9155231714248657, + "epoch": 1.2170731707317073, + "grad_norm": 0.5254876017570496, + "learning_rate": 4.3631436314363145e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7245771884918213, + "num_tokens": 6849386.0, + "step": 499 + }, + { + "entropy": 0.9062044620513916, + "epoch": 1.2195121951219512, + "grad_norm": 0.4622846841812134, + "learning_rate": 4.34959349593496e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7278857827186584, + "num_tokens": 6863341.0, + "step": 500 + }, + { + "entropy": 0.9044604301452637, + "epoch": 1.221951219512195, + "grad_norm": 0.5532169342041016, + "learning_rate": 4.336043360433605e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7176086902618408, + "num_tokens": 6877157.0, + "step": 501 + }, + { + "entropy": 0.8994400501251221, + "epoch": 1.224390243902439, + "grad_norm": 0.5434163212776184, + "learning_rate": 4.32249322493225e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7263439893722534, + "num_tokens": 6890975.0, + "step": 502 + }, + { + "entropy": 0.9025291204452515, + "epoch": 1.226829268292683, + "grad_norm": 0.4925203323364258, + "learning_rate": 4.308943089430894e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7318164706230164, + "num_tokens": 6904176.0, + "step": 503 + }, + { + "entropy": 0.9300906658172607, + "epoch": 1.2292682926829268, + "grad_norm": 0.5197744369506836, + "learning_rate": 4.29539295392954e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7121908664703369, + "num_tokens": 6917899.0, + "step": 504 + }, + { + "entropy": 0.9055182933807373, + "epoch": 1.2317073170731707, + "grad_norm": 0.4847884476184845, + "learning_rate": 4.281842818428185e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.722909688949585, + "num_tokens": 6931896.0, + "step": 505 + }, + { + "entropy": 0.9060989618301392, + "epoch": 1.2341463414634146, + "grad_norm": 0.4853264093399048, + "learning_rate": 4.268292682926829e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7221659421920776, + "num_tokens": 6945726.0, + "step": 506 + }, + { + "entropy": 0.8736509084701538, + "epoch": 1.2365853658536585, + "grad_norm": 0.48868390917778015, + "learning_rate": 4.254742547425474e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7347749471664429, + "num_tokens": 6959338.0, + "step": 507 + }, + { + "entropy": 0.925621747970581, + "epoch": 1.2390243902439024, + "grad_norm": 0.5540333390235901, + "learning_rate": 4.2411924119241196e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7208184003829956, + "num_tokens": 6973284.0, + "step": 508 + }, + { + "entropy": 0.911636233329773, + "epoch": 1.2414634146341463, + "grad_norm": 0.5570753812789917, + "learning_rate": 4.227642276422765e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7139846086502075, + "num_tokens": 6987058.0, + "step": 509 + }, + { + "entropy": 0.9004988670349121, + "epoch": 1.2439024390243902, + "grad_norm": 0.5094306468963623, + "learning_rate": 4.21409214092141e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.724961519241333, + "num_tokens": 7000723.0, + "step": 510 + }, + { + "entropy": 0.8960776329040527, + "epoch": 1.2463414634146341, + "grad_norm": 0.5077755451202393, + "learning_rate": 4.200542005420054e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7270680665969849, + "num_tokens": 7014508.0, + "step": 511 + }, + { + "entropy": 0.9104132056236267, + "epoch": 1.248780487804878, + "grad_norm": 0.48007309436798096, + "learning_rate": 4.1869918699186995e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7218325138092041, + "num_tokens": 7028494.0, + "step": 512 + }, + { + "entropy": 0.8949484825134277, + "epoch": 1.251219512195122, + "grad_norm": 0.4952548146247864, + "learning_rate": 4.173441734417345e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.729067862033844, + "num_tokens": 7042639.0, + "step": 513 + }, + { + "entropy": 0.9143623113632202, + "epoch": 1.2536585365853659, + "grad_norm": 0.4703666567802429, + "learning_rate": 4.159891598915989e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7206144332885742, + "num_tokens": 7056131.0, + "step": 514 + }, + { + "entropy": 0.9043334126472473, + "epoch": 1.2560975609756098, + "grad_norm": 0.5099503397941589, + "learning_rate": 4.146341463414634e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7296978831291199, + "num_tokens": 7069717.0, + "step": 515 + }, + { + "entropy": 0.9112102389335632, + "epoch": 1.2585365853658537, + "grad_norm": 0.5303446054458618, + "learning_rate": 4.1327913279132794e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7211565971374512, + "num_tokens": 7083601.0, + "step": 516 + }, + { + "entropy": 0.8933310508728027, + "epoch": 1.2609756097560976, + "grad_norm": 0.49321338534355164, + "learning_rate": 4.119241192411925e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7297635078430176, + "num_tokens": 7097231.0, + "step": 517 + }, + { + "entropy": 0.8844630122184753, + "epoch": 1.2634146341463415, + "grad_norm": 0.46473798155784607, + "learning_rate": 4.10569105691057e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.726249098777771, + "num_tokens": 7111457.0, + "step": 518 + }, + { + "entropy": 0.8918311595916748, + "epoch": 1.2658536585365854, + "grad_norm": 0.474924772977829, + "learning_rate": 4.092140921409214e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7301933169364929, + "num_tokens": 7125231.0, + "step": 519 + }, + { + "entropy": 0.8817993402481079, + "epoch": 1.2682926829268293, + "grad_norm": 0.601287841796875, + "learning_rate": 4.078590785907859e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7307068109512329, + "num_tokens": 7138801.0, + "step": 520 + }, + { + "entropy": 0.8969420194625854, + "epoch": 1.2707317073170732, + "grad_norm": 0.5099979043006897, + "learning_rate": 4.0650406504065046e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7282543778419495, + "num_tokens": 7152245.0, + "step": 521 + }, + { + "entropy": 0.8839466571807861, + "epoch": 1.273170731707317, + "grad_norm": 0.45876964926719666, + "learning_rate": 4.051490514905149e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7337388396263123, + "num_tokens": 7166236.0, + "step": 522 + }, + { + "entropy": 0.901630163192749, + "epoch": 1.275609756097561, + "grad_norm": 0.539262592792511, + "learning_rate": 4.037940379403794e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7242372035980225, + "num_tokens": 7180148.0, + "step": 523 + }, + { + "entropy": 0.883599042892456, + "epoch": 1.278048780487805, + "grad_norm": 0.504664421081543, + "learning_rate": 4.024390243902439e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7333284616470337, + "num_tokens": 7193900.0, + "step": 524 + }, + { + "entropy": 0.8952665328979492, + "epoch": 1.2804878048780488, + "grad_norm": 0.4933059811592102, + "learning_rate": 4.0108401084010845e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7243708372116089, + "num_tokens": 7207982.0, + "step": 525 + }, + { + "entropy": 0.8998792171478271, + "epoch": 1.2829268292682927, + "grad_norm": 0.4748615026473999, + "learning_rate": 3.99728997289973e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7223044037818909, + "num_tokens": 7222197.0, + "step": 526 + }, + { + "entropy": 0.8993245363235474, + "epoch": 1.2853658536585366, + "grad_norm": 0.5399959683418274, + "learning_rate": 3.983739837398374e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7197199463844299, + "num_tokens": 7236067.0, + "step": 527 + }, + { + "entropy": 0.8935285806655884, + "epoch": 1.2878048780487805, + "grad_norm": 0.5220499038696289, + "learning_rate": 3.970189701897019e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7281142473220825, + "num_tokens": 7250019.0, + "step": 528 + }, + { + "entropy": 0.8838009834289551, + "epoch": 1.2902439024390244, + "grad_norm": 0.49501916766166687, + "learning_rate": 3.9566395663956644e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7248112559318542, + "num_tokens": 7263811.0, + "step": 529 + }, + { + "entropy": 0.895301103591919, + "epoch": 1.2926829268292683, + "grad_norm": 0.493888795375824, + "learning_rate": 3.943089430894309e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7277877330780029, + "num_tokens": 7277243.0, + "step": 530 + }, + { + "entropy": 0.9079329967498779, + "epoch": 1.2951219512195122, + "grad_norm": 0.47917982935905457, + "learning_rate": 3.929539295392954e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.724371075630188, + "num_tokens": 7291013.0, + "step": 531 + }, + { + "entropy": 0.9105004668235779, + "epoch": 1.2975609756097561, + "grad_norm": 0.5026032328605652, + "learning_rate": 3.915989159891599e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7229947447776794, + "num_tokens": 7304693.0, + "step": 532 + }, + { + "entropy": 0.8809376955032349, + "epoch": 1.3, + "grad_norm": 0.47273725271224976, + "learning_rate": 3.902439024390244e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7340049743652344, + "num_tokens": 7318010.0, + "step": 533 + }, + { + "entropy": 0.8872978687286377, + "epoch": 1.302439024390244, + "grad_norm": 0.46795758605003357, + "learning_rate": 3.88888888888889e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.726021409034729, + "num_tokens": 7331757.0, + "step": 534 + }, + { + "entropy": 0.8998485207557678, + "epoch": 1.3048780487804879, + "grad_norm": 0.5320788025856018, + "learning_rate": 3.875338753387534e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7274210453033447, + "num_tokens": 7345868.0, + "step": 535 + }, + { + "entropy": 0.8918326497077942, + "epoch": 1.3073170731707318, + "grad_norm": 0.5132240056991577, + "learning_rate": 3.861788617886179e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7323923110961914, + "num_tokens": 7359628.0, + "step": 536 + }, + { + "entropy": 0.8886317014694214, + "epoch": 1.3097560975609757, + "grad_norm": 0.5985807776451111, + "learning_rate": 3.848238482384824e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7288233637809753, + "num_tokens": 7373480.0, + "step": 537 + }, + { + "entropy": 0.8903034329414368, + "epoch": 1.3121951219512196, + "grad_norm": 0.549172580242157, + "learning_rate": 3.834688346883469e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7226969599723816, + "num_tokens": 7387304.0, + "step": 538 + }, + { + "entropy": 0.8851415514945984, + "epoch": 1.3146341463414635, + "grad_norm": 0.5681750774383545, + "learning_rate": 3.821138211382115e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7248347401618958, + "num_tokens": 7400632.0, + "step": 539 + }, + { + "entropy": 0.882635235786438, + "epoch": 1.3170731707317074, + "grad_norm": 0.5316335558891296, + "learning_rate": 3.8075880758807595e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7311773300170898, + "num_tokens": 7414408.0, + "step": 540 + }, + { + "entropy": 0.8970792293548584, + "epoch": 1.3195121951219513, + "grad_norm": 0.5417896509170532, + "learning_rate": 3.7940379403794043e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7224768996238708, + "num_tokens": 7428603.0, + "step": 541 + }, + { + "entropy": 0.9087656736373901, + "epoch": 1.3219512195121952, + "grad_norm": 0.5408928990364075, + "learning_rate": 3.780487804878049e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7309739589691162, + "num_tokens": 7442521.0, + "step": 542 + }, + { + "entropy": 0.9143552780151367, + "epoch": 1.324390243902439, + "grad_norm": 0.5758176445960999, + "learning_rate": 3.7669376693766942e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7155868411064148, + "num_tokens": 7456042.0, + "step": 543 + }, + { + "entropy": 0.8879578113555908, + "epoch": 1.326829268292683, + "grad_norm": 0.5441621541976929, + "learning_rate": 3.753387533875339e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7230884432792664, + "num_tokens": 7469398.0, + "step": 544 + }, + { + "entropy": 0.8851050734519958, + "epoch": 1.329268292682927, + "grad_norm": 0.5097270607948303, + "learning_rate": 3.7398373983739838e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7331685423851013, + "num_tokens": 7483168.0, + "step": 545 + }, + { + "entropy": 0.8914124965667725, + "epoch": 1.3317073170731708, + "grad_norm": 0.4741462171077728, + "learning_rate": 3.7262872628726286e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7253215909004211, + "num_tokens": 7497022.0, + "step": 546 + }, + { + "entropy": 0.8920881748199463, + "epoch": 1.3341463414634147, + "grad_norm": 0.544121503829956, + "learning_rate": 3.712737127371274e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7277756929397583, + "num_tokens": 7510233.0, + "step": 547 + }, + { + "entropy": 0.8978779315948486, + "epoch": 1.3365853658536586, + "grad_norm": 0.6094754934310913, + "learning_rate": 3.699186991869919e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.725739598274231, + "num_tokens": 7524006.0, + "step": 548 + }, + { + "entropy": 0.8959704637527466, + "epoch": 1.3390243902439025, + "grad_norm": 0.484652042388916, + "learning_rate": 3.685636856368564e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7273457050323486, + "num_tokens": 7537728.0, + "step": 549 + }, + { + "entropy": 0.8667632341384888, + "epoch": 1.3414634146341464, + "grad_norm": 0.5026461482048035, + "learning_rate": 3.672086720867209e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7339208722114563, + "num_tokens": 7551473.0, + "step": 550 + }, + { + "entropy": 0.8890734910964966, + "epoch": 1.34390243902439, + "grad_norm": 0.5242050886154175, + "learning_rate": 3.6585365853658537e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7381078600883484, + "num_tokens": 7564712.0, + "step": 551 + }, + { + "entropy": 0.8986338376998901, + "epoch": 1.346341463414634, + "grad_norm": 0.5245202779769897, + "learning_rate": 3.644986449864499e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7265607118606567, + "num_tokens": 7578007.0, + "step": 552 + }, + { + "entropy": 0.9158647656440735, + "epoch": 1.348780487804878, + "grad_norm": 0.5433186888694763, + "learning_rate": 3.6314363143631437e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7216664552688599, + "num_tokens": 7592017.0, + "step": 553 + }, + { + "entropy": 0.876274585723877, + "epoch": 1.3512195121951218, + "grad_norm": 0.5984988808631897, + "learning_rate": 3.6178861788617893e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.732636570930481, + "num_tokens": 7605524.0, + "step": 554 + }, + { + "entropy": 0.8815956711769104, + "epoch": 1.3536585365853657, + "grad_norm": 0.5312358140945435, + "learning_rate": 3.604336043360434e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7308400273323059, + "num_tokens": 7619123.0, + "step": 555 + }, + { + "entropy": 0.8996585607528687, + "epoch": 1.3560975609756096, + "grad_norm": 0.5443177819252014, + "learning_rate": 3.590785907859079e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.727543830871582, + "num_tokens": 7632888.0, + "step": 556 + }, + { + "entropy": 0.9065992832183838, + "epoch": 1.3585365853658535, + "grad_norm": 0.5264752507209778, + "learning_rate": 3.577235772357724e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7188636064529419, + "num_tokens": 7646385.0, + "step": 557 + }, + { + "entropy": 0.8936566114425659, + "epoch": 1.3609756097560974, + "grad_norm": 0.5454937815666199, + "learning_rate": 3.563685636856369e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7289999127388, + "num_tokens": 7659770.0, + "step": 558 + }, + { + "entropy": 0.8959879875183105, + "epoch": 1.3634146341463413, + "grad_norm": 0.4646812379360199, + "learning_rate": 3.5501355013550136e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7275198698043823, + "num_tokens": 7673765.0, + "step": 559 + }, + { + "entropy": 0.8788614273071289, + "epoch": 1.3658536585365852, + "grad_norm": 0.5636532306671143, + "learning_rate": 3.5365853658536588e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7269564270973206, + "num_tokens": 7687863.0, + "step": 560 + }, + { + "entropy": 0.8784700632095337, + "epoch": 1.3682926829268292, + "grad_norm": 0.4874018728733063, + "learning_rate": 3.5230352303523035e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7319697737693787, + "num_tokens": 7701911.0, + "step": 561 + }, + { + "entropy": 0.8921349048614502, + "epoch": 1.370731707317073, + "grad_norm": 0.5778629779815674, + "learning_rate": 3.509485094850949e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7272459864616394, + "num_tokens": 7715518.0, + "step": 562 + }, + { + "entropy": 0.8773680329322815, + "epoch": 1.373170731707317, + "grad_norm": 0.5318922996520996, + "learning_rate": 3.495934959349594e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7313464879989624, + "num_tokens": 7729392.0, + "step": 563 + }, + { + "entropy": 0.8853249549865723, + "epoch": 1.3756097560975609, + "grad_norm": 0.5359835624694824, + "learning_rate": 3.4823848238482387e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7314780354499817, + "num_tokens": 7742919.0, + "step": 564 + }, + { + "entropy": 0.886237621307373, + "epoch": 1.3780487804878048, + "grad_norm": 0.5415591597557068, + "learning_rate": 3.468834688346884e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7315119504928589, + "num_tokens": 7756295.0, + "step": 565 + }, + { + "entropy": 0.8944040536880493, + "epoch": 1.3804878048780487, + "grad_norm": 0.5151652097702026, + "learning_rate": 3.4552845528455287e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7207305431365967, + "num_tokens": 7770054.0, + "step": 566 + }, + { + "entropy": 0.8833385705947876, + "epoch": 1.3829268292682926, + "grad_norm": 0.4992096722126007, + "learning_rate": 3.4417344173441734e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7270156741142273, + "num_tokens": 7783862.0, + "step": 567 + }, + { + "entropy": 0.9013800621032715, + "epoch": 1.3853658536585365, + "grad_norm": 0.51569664478302, + "learning_rate": 3.4281842818428186e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7236117124557495, + "num_tokens": 7797312.0, + "step": 568 + }, + { + "entropy": 0.8924955725669861, + "epoch": 1.3878048780487804, + "grad_norm": 0.5286757349967957, + "learning_rate": 3.414634146341464e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7262544631958008, + "num_tokens": 7810720.0, + "step": 569 + }, + { + "entropy": 0.8890297412872314, + "epoch": 1.3902439024390243, + "grad_norm": 0.5955765247344971, + "learning_rate": 3.401084010840109e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7231224775314331, + "num_tokens": 7824251.0, + "step": 570 + }, + { + "entropy": 0.8626318573951721, + "epoch": 1.3926829268292682, + "grad_norm": 0.4987097680568695, + "learning_rate": 3.387533875338754e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7393428087234497, + "num_tokens": 7837779.0, + "step": 571 + }, + { + "entropy": 0.8848908543586731, + "epoch": 1.395121951219512, + "grad_norm": 0.4759664535522461, + "learning_rate": 3.3739837398373986e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7298733592033386, + "num_tokens": 7851533.0, + "step": 572 + }, + { + "entropy": 0.8894503116607666, + "epoch": 1.397560975609756, + "grad_norm": 0.4857938587665558, + "learning_rate": 3.3604336043360438e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7303609251976013, + "num_tokens": 7865208.0, + "step": 573 + }, + { + "entropy": 0.9084067344665527, + "epoch": 1.4, + "grad_norm": 0.5191188454627991, + "learning_rate": 3.3468834688346886e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7281098961830139, + "num_tokens": 7878689.0, + "step": 574 + }, + { + "entropy": 0.8691054582595825, + "epoch": 1.4024390243902438, + "grad_norm": 0.49921202659606934, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7362488508224487, + "num_tokens": 7892013.0, + "step": 575 + }, + { + "entropy": 0.8847055435180664, + "epoch": 1.4048780487804877, + "grad_norm": 0.4999859035015106, + "learning_rate": 3.3197831978319785e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7334121465682983, + "num_tokens": 7905563.0, + "step": 576 + }, + { + "entropy": 0.9092885255813599, + "epoch": 1.4073170731707316, + "grad_norm": 0.5342906713485718, + "learning_rate": 3.3062330623306237e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7145213484764099, + "num_tokens": 7919524.0, + "step": 577 + }, + { + "entropy": 0.8825151920318604, + "epoch": 1.4097560975609755, + "grad_norm": 0.5005236268043518, + "learning_rate": 3.292682926829269e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7287511825561523, + "num_tokens": 7933129.0, + "step": 578 + }, + { + "entropy": 0.8973416686058044, + "epoch": 1.4121951219512194, + "grad_norm": 0.5246015191078186, + "learning_rate": 3.2791327913279137e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7312119603157043, + "num_tokens": 7946784.0, + "step": 579 + }, + { + "entropy": 0.8993310928344727, + "epoch": 1.4146341463414633, + "grad_norm": 0.5140748023986816, + "learning_rate": 3.2655826558265585e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7212364673614502, + "num_tokens": 7961002.0, + "step": 580 + }, + { + "entropy": 0.8946079015731812, + "epoch": 1.4170731707317072, + "grad_norm": 0.6002617478370667, + "learning_rate": 3.2520325203252037e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7245283126831055, + "num_tokens": 7974798.0, + "step": 581 + }, + { + "entropy": 0.8892184495925903, + "epoch": 1.4195121951219511, + "grad_norm": 0.5291239619255066, + "learning_rate": 3.2384823848238484e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7324248552322388, + "num_tokens": 7988683.0, + "step": 582 + }, + { + "entropy": 0.8774310946464539, + "epoch": 1.421951219512195, + "grad_norm": 0.5266504287719727, + "learning_rate": 3.224932249322493e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7288246750831604, + "num_tokens": 8002406.0, + "step": 583 + }, + { + "entropy": 0.8831679821014404, + "epoch": 1.424390243902439, + "grad_norm": 0.5562174320220947, + "learning_rate": 3.211382113821139e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7314850091934204, + "num_tokens": 8015479.0, + "step": 584 + }, + { + "entropy": 0.9056977033615112, + "epoch": 1.4268292682926829, + "grad_norm": 0.5713416934013367, + "learning_rate": 3.1978319783197836e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7224880456924438, + "num_tokens": 8029080.0, + "step": 585 + }, + { + "entropy": 0.8931180238723755, + "epoch": 1.4292682926829268, + "grad_norm": 0.503696858882904, + "learning_rate": 3.1842818428184284e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7255293726921082, + "num_tokens": 8042602.0, + "step": 586 + }, + { + "entropy": 0.8890825510025024, + "epoch": 1.4317073170731707, + "grad_norm": 0.5064833760261536, + "learning_rate": 3.1707317073170736e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7287392020225525, + "num_tokens": 8056070.0, + "step": 587 + }, + { + "entropy": 0.8768372535705566, + "epoch": 1.4341463414634146, + "grad_norm": 0.5217829346656799, + "learning_rate": 3.1571815718157183e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7340074777603149, + "num_tokens": 8069733.0, + "step": 588 + }, + { + "entropy": 0.8988307118415833, + "epoch": 1.4365853658536585, + "grad_norm": 0.5267525315284729, + "learning_rate": 3.143631436314363e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7272128462791443, + "num_tokens": 8083408.0, + "step": 589 + }, + { + "entropy": 0.8802148103713989, + "epoch": 1.4390243902439024, + "grad_norm": 0.49351492524147034, + "learning_rate": 3.1300813008130083e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7272524833679199, + "num_tokens": 8096887.0, + "step": 590 + }, + { + "entropy": 0.8733420372009277, + "epoch": 1.4414634146341463, + "grad_norm": 0.6216485500335693, + "learning_rate": 3.116531165311653e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7351359128952026, + "num_tokens": 8110476.0, + "step": 591 + }, + { + "entropy": 0.8928675651550293, + "epoch": 1.4439024390243902, + "grad_norm": 0.5383737683296204, + "learning_rate": 3.1029810298102987e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7218061089515686, + "num_tokens": 8124511.0, + "step": 592 + }, + { + "entropy": 0.886874794960022, + "epoch": 1.446341463414634, + "grad_norm": 0.5209896564483643, + "learning_rate": 3.0894308943089435e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7260102033615112, + "num_tokens": 8138064.0, + "step": 593 + }, + { + "entropy": 0.8790051937103271, + "epoch": 1.448780487804878, + "grad_norm": 0.5296643376350403, + "learning_rate": 3.0758807588075882e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.729468584060669, + "num_tokens": 8151949.0, + "step": 594 + }, + { + "entropy": 0.88617342710495, + "epoch": 1.451219512195122, + "grad_norm": 0.5086992383003235, + "learning_rate": 3.0623306233062334e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7311142683029175, + "num_tokens": 8165454.0, + "step": 595 + }, + { + "entropy": 0.8895916938781738, + "epoch": 1.4536585365853658, + "grad_norm": 0.6400481462478638, + "learning_rate": 3.0487804878048782e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7254001498222351, + "num_tokens": 8179403.0, + "step": 596 + }, + { + "entropy": 0.8727394938468933, + "epoch": 1.4560975609756097, + "grad_norm": 0.5148288011550903, + "learning_rate": 3.035230352303523e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7360201478004456, + "num_tokens": 8193314.0, + "step": 597 + }, + { + "entropy": 0.8800617456436157, + "epoch": 1.4585365853658536, + "grad_norm": 0.555195152759552, + "learning_rate": 3.021680216802168e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7324080467224121, + "num_tokens": 8206731.0, + "step": 598 + }, + { + "entropy": 0.8606910705566406, + "epoch": 1.4609756097560975, + "grad_norm": 0.5864395499229431, + "learning_rate": 3.0081300813008134e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7344096899032593, + "num_tokens": 8220249.0, + "step": 599 + }, + { + "entropy": 0.8760560154914856, + "epoch": 1.4634146341463414, + "grad_norm": 0.5235090255737305, + "learning_rate": 2.9945799457994586e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7239957451820374, + "num_tokens": 8234330.0, + "step": 600 + }, + { + "entropy": 0.8802227973937988, + "epoch": 1.4658536585365853, + "grad_norm": 0.522666871547699, + "learning_rate": 2.9810298102981034e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7270638942718506, + "num_tokens": 8248276.0, + "step": 601 + }, + { + "entropy": 0.8891713619232178, + "epoch": 1.4682926829268292, + "grad_norm": 0.4861930012702942, + "learning_rate": 2.967479674796748e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.721794843673706, + "num_tokens": 8262332.0, + "step": 602 + }, + { + "entropy": 0.8843443393707275, + "epoch": 1.4707317073170731, + "grad_norm": 0.5338602662086487, + "learning_rate": 2.9539295392953933e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7304834127426147, + "num_tokens": 8275939.0, + "step": 603 + }, + { + "entropy": 0.8781300783157349, + "epoch": 1.473170731707317, + "grad_norm": 0.5555716156959534, + "learning_rate": 2.940379403794038e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.729243814945221, + "num_tokens": 8289192.0, + "step": 604 + }, + { + "entropy": 0.8766529560089111, + "epoch": 1.475609756097561, + "grad_norm": 0.6152108907699585, + "learning_rate": 2.926829268292683e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.726624071598053, + "num_tokens": 8302662.0, + "step": 605 + }, + { + "entropy": 0.9065438508987427, + "epoch": 1.4780487804878049, + "grad_norm": 0.4771234095096588, + "learning_rate": 2.913279132791328e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7299425601959229, + "num_tokens": 8316775.0, + "step": 606 + }, + { + "entropy": 0.8783807754516602, + "epoch": 1.4804878048780488, + "grad_norm": 0.5496251583099365, + "learning_rate": 2.8997289972899733e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.730849027633667, + "num_tokens": 8330289.0, + "step": 607 + }, + { + "entropy": 0.905014157295227, + "epoch": 1.4829268292682927, + "grad_norm": 0.5509771704673767, + "learning_rate": 2.8861788617886185e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7334033250808716, + "num_tokens": 8343636.0, + "step": 608 + }, + { + "entropy": 0.875667929649353, + "epoch": 1.4853658536585366, + "grad_norm": 0.5156928896903992, + "learning_rate": 2.8726287262872632e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7322340607643127, + "num_tokens": 8357358.0, + "step": 609 + }, + { + "entropy": 0.892330527305603, + "epoch": 1.4878048780487805, + "grad_norm": 0.5027339458465576, + "learning_rate": 2.859078590785908e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7313015460968018, + "num_tokens": 8370958.0, + "step": 610 + }, + { + "entropy": 0.8820470571517944, + "epoch": 1.4902439024390244, + "grad_norm": 0.48004716634750366, + "learning_rate": 2.845528455284553e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7273856401443481, + "num_tokens": 8384660.0, + "step": 611 + }, + { + "entropy": 0.8868808746337891, + "epoch": 1.4926829268292683, + "grad_norm": 0.4970702528953552, + "learning_rate": 2.831978319783198e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7320648431777954, + "num_tokens": 8398183.0, + "step": 612 + }, + { + "entropy": 0.8882201910018921, + "epoch": 1.4951219512195122, + "grad_norm": 0.5164826512336731, + "learning_rate": 2.8184281842818427e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7296938300132751, + "num_tokens": 8411754.0, + "step": 613 + }, + { + "entropy": 0.8878587484359741, + "epoch": 1.497560975609756, + "grad_norm": 0.5206930637359619, + "learning_rate": 2.8048780487804884e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7336882948875427, + "num_tokens": 8425104.0, + "step": 614 + }, + { + "entropy": 0.8786503076553345, + "epoch": 1.5, + "grad_norm": 0.4896388053894043, + "learning_rate": 2.791327913279133e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7345880270004272, + "num_tokens": 8438616.0, + "step": 615 + }, + { + "entropy": 0.8783841729164124, + "epoch": 1.502439024390244, + "grad_norm": 0.5044620037078857, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7318198680877686, + "num_tokens": 8452067.0, + "step": 616 + }, + { + "entropy": 0.8775740265846252, + "epoch": 1.5048780487804878, + "grad_norm": 0.5294600129127502, + "learning_rate": 2.764227642276423e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7268987894058228, + "num_tokens": 8466184.0, + "step": 617 + }, + { + "entropy": 0.8924665451049805, + "epoch": 1.5073170731707317, + "grad_norm": 0.6135087609291077, + "learning_rate": 2.750677506775068e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7222589254379272, + "num_tokens": 8479817.0, + "step": 618 + }, + { + "entropy": 0.8739150762557983, + "epoch": 1.5097560975609756, + "grad_norm": 0.4934721887111664, + "learning_rate": 2.7371273712737127e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.730917751789093, + "num_tokens": 8493301.0, + "step": 619 + }, + { + "entropy": 0.8825638890266418, + "epoch": 1.5121951219512195, + "grad_norm": 0.4939766526222229, + "learning_rate": 2.723577235772358e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.730536699295044, + "num_tokens": 8506881.0, + "step": 620 + }, + { + "entropy": 0.8853427767753601, + "epoch": 1.5146341463414634, + "grad_norm": 0.5113255977630615, + "learning_rate": 2.7100271002710026e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7268093824386597, + "num_tokens": 8520631.0, + "step": 621 + }, + { + "entropy": 0.882167398929596, + "epoch": 1.5170731707317073, + "grad_norm": 0.5379652976989746, + "learning_rate": 2.6964769647696482e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7336709499359131, + "num_tokens": 8534273.0, + "step": 622 + }, + { + "entropy": 0.8745768070220947, + "epoch": 1.5195121951219512, + "grad_norm": 0.47744041681289673, + "learning_rate": 2.682926829268293e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7373600602149963, + "num_tokens": 8547600.0, + "step": 623 + }, + { + "entropy": 0.8744478225708008, + "epoch": 1.5219512195121951, + "grad_norm": 0.48806169629096985, + "learning_rate": 2.669376693766938e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7403261661529541, + "num_tokens": 8561106.0, + "step": 624 + }, + { + "entropy": 0.8737915754318237, + "epoch": 1.524390243902439, + "grad_norm": 0.5594715476036072, + "learning_rate": 2.655826558265583e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7284777760505676, + "num_tokens": 8575003.0, + "step": 625 + }, + { + "entropy": 0.8842061161994934, + "epoch": 1.526829268292683, + "grad_norm": 0.5108963251113892, + "learning_rate": 2.6422764227642278e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7290242910385132, + "num_tokens": 8588773.0, + "step": 626 + }, + { + "entropy": 0.8844512701034546, + "epoch": 1.5292682926829269, + "grad_norm": 0.5226480960845947, + "learning_rate": 2.6287262872628725e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7259301543235779, + "num_tokens": 8602738.0, + "step": 627 + }, + { + "entropy": 0.8773095011711121, + "epoch": 1.5317073170731708, + "grad_norm": 0.530270516872406, + "learning_rate": 2.6151761517615177e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7299329042434692, + "num_tokens": 8616321.0, + "step": 628 + }, + { + "entropy": 0.883411169052124, + "epoch": 1.5341463414634147, + "grad_norm": 0.5226531624794006, + "learning_rate": 2.601626016260163e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7328861951828003, + "num_tokens": 8630054.0, + "step": 629 + }, + { + "entropy": 0.8826172947883606, + "epoch": 1.5365853658536586, + "grad_norm": 0.5279987454414368, + "learning_rate": 2.588075880758808e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.73455411195755, + "num_tokens": 8643941.0, + "step": 630 + }, + { + "entropy": 0.8816993236541748, + "epoch": 1.5390243902439025, + "grad_norm": 0.4635319113731384, + "learning_rate": 2.574525745257453e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7324573993682861, + "num_tokens": 8658037.0, + "step": 631 + }, + { + "entropy": 0.8873631954193115, + "epoch": 1.5414634146341464, + "grad_norm": 0.5214371681213379, + "learning_rate": 2.5609756097560977e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7238868474960327, + "num_tokens": 8672337.0, + "step": 632 + }, + { + "entropy": 0.8695685863494873, + "epoch": 1.5439024390243903, + "grad_norm": 0.522845447063446, + "learning_rate": 2.547425474254743e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7336251139640808, + "num_tokens": 8686063.0, + "step": 633 + }, + { + "entropy": 0.8817265033721924, + "epoch": 1.5463414634146342, + "grad_norm": 0.5790318250656128, + "learning_rate": 2.5338753387533876e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.735949695110321, + "num_tokens": 8699762.0, + "step": 634 + }, + { + "entropy": 0.8943449258804321, + "epoch": 1.548780487804878, + "grad_norm": 0.5461540818214417, + "learning_rate": 2.5203252032520324e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7255430817604065, + "num_tokens": 8713128.0, + "step": 635 + }, + { + "entropy": 0.87168288230896, + "epoch": 1.551219512195122, + "grad_norm": 0.50957190990448, + "learning_rate": 2.5067750677506776e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7287718653678894, + "num_tokens": 8726970.0, + "step": 636 + }, + { + "entropy": 0.8847887516021729, + "epoch": 1.553658536585366, + "grad_norm": 0.4929066002368927, + "learning_rate": 2.493224932249323e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7283905148506165, + "num_tokens": 8740649.0, + "step": 637 + }, + { + "entropy": 0.8921957015991211, + "epoch": 1.5560975609756098, + "grad_norm": 0.48889023065567017, + "learning_rate": 2.4796747967479676e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.726151168346405, + "num_tokens": 8754607.0, + "step": 638 + }, + { + "entropy": 0.8531458973884583, + "epoch": 1.5585365853658537, + "grad_norm": 0.5207129716873169, + "learning_rate": 2.4661246612466128e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7364062666893005, + "num_tokens": 8768269.0, + "step": 639 + }, + { + "entropy": 0.8851905465126038, + "epoch": 1.5609756097560976, + "grad_norm": 0.5498895049095154, + "learning_rate": 2.4525745257452575e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7295770645141602, + "num_tokens": 8782093.0, + "step": 640 + }, + { + "entropy": 0.8695498704910278, + "epoch": 1.5634146341463415, + "grad_norm": 0.4955178499221802, + "learning_rate": 2.4390243902439027e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7328490614891052, + "num_tokens": 8795738.0, + "step": 641 + }, + { + "entropy": 0.8819414973258972, + "epoch": 1.5658536585365854, + "grad_norm": 0.5156698822975159, + "learning_rate": 2.4254742547425475e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7310495376586914, + "num_tokens": 8809474.0, + "step": 642 + }, + { + "entropy": 0.8857159614562988, + "epoch": 1.5682926829268293, + "grad_norm": 0.5351740121841431, + "learning_rate": 2.4119241192411927e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7224662899971008, + "num_tokens": 8823146.0, + "step": 643 + }, + { + "entropy": 0.8779886364936829, + "epoch": 1.5707317073170732, + "grad_norm": 0.5071956515312195, + "learning_rate": 2.3983739837398375e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7339435815811157, + "num_tokens": 8836708.0, + "step": 644 + }, + { + "entropy": 0.9025756120681763, + "epoch": 1.5731707317073171, + "grad_norm": 0.5226132273674011, + "learning_rate": 2.3848238482384827e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7239575386047363, + "num_tokens": 8850298.0, + "step": 645 + }, + { + "entropy": 0.8937682509422302, + "epoch": 1.575609756097561, + "grad_norm": 0.5704896450042725, + "learning_rate": 2.371273712737128e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.721930980682373, + "num_tokens": 8864048.0, + "step": 646 + }, + { + "entropy": 0.8963336944580078, + "epoch": 1.578048780487805, + "grad_norm": 0.5066708326339722, + "learning_rate": 2.3577235772357727e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7227367162704468, + "num_tokens": 8877993.0, + "step": 647 + }, + { + "entropy": 0.8906478881835938, + "epoch": 1.5804878048780489, + "grad_norm": 0.47832056879997253, + "learning_rate": 2.3441734417344174e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.728906512260437, + "num_tokens": 8891141.0, + "step": 648 + }, + { + "entropy": 0.8920251131057739, + "epoch": 1.5829268292682928, + "grad_norm": 0.5094121098518372, + "learning_rate": 2.3306233062330626e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7286123037338257, + "num_tokens": 8905137.0, + "step": 649 + }, + { + "entropy": 0.9012845158576965, + "epoch": 1.5853658536585367, + "grad_norm": 0.5223912000656128, + "learning_rate": 2.317073170731708e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7305706739425659, + "num_tokens": 8919101.0, + "step": 650 + }, + { + "entropy": 0.8928289413452148, + "epoch": 1.5878048780487806, + "grad_norm": 0.5515636801719666, + "learning_rate": 2.3035230352303526e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.724931538105011, + "num_tokens": 8932630.0, + "step": 651 + }, + { + "entropy": 0.8886030912399292, + "epoch": 1.5902439024390245, + "grad_norm": 0.49004659056663513, + "learning_rate": 2.2899728997289974e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7325581312179565, + "num_tokens": 8946406.0, + "step": 652 + }, + { + "entropy": 0.8970048427581787, + "epoch": 1.5926829268292684, + "grad_norm": 0.5707719326019287, + "learning_rate": 2.2764227642276426e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.725605845451355, + "num_tokens": 8960493.0, + "step": 653 + }, + { + "entropy": 0.8691211938858032, + "epoch": 1.5951219512195123, + "grad_norm": 0.5564479827880859, + "learning_rate": 2.2628726287262878e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7343336343765259, + "num_tokens": 8974105.0, + "step": 654 + }, + { + "entropy": 0.8998392224311829, + "epoch": 1.5975609756097562, + "grad_norm": 0.5155988931655884, + "learning_rate": 2.2493224932249325e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7190064787864685, + "num_tokens": 8988011.0, + "step": 655 + }, + { + "entropy": 0.8792390823364258, + "epoch": 1.6, + "grad_norm": 0.548163115978241, + "learning_rate": 2.2357723577235773e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7319982051849365, + "num_tokens": 9001359.0, + "step": 656 + }, + { + "entropy": 0.9006599187850952, + "epoch": 1.602439024390244, + "grad_norm": 0.5012010931968689, + "learning_rate": 2.222222222222222e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7241604924201965, + "num_tokens": 9015133.0, + "step": 657 + }, + { + "entropy": 0.8827446699142456, + "epoch": 1.604878048780488, + "grad_norm": 0.562204122543335, + "learning_rate": 2.2086720867208673e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7299302220344543, + "num_tokens": 9028764.0, + "step": 658 + }, + { + "entropy": 0.8815902471542358, + "epoch": 1.6073170731707318, + "grad_norm": 0.5246950387954712, + "learning_rate": 2.1951219512195125e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7348490357398987, + "num_tokens": 9042459.0, + "step": 659 + }, + { + "entropy": 0.9139642715454102, + "epoch": 1.6097560975609757, + "grad_norm": 0.5334653258323669, + "learning_rate": 2.1815718157181572e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.724953293800354, + "num_tokens": 9056389.0, + "step": 660 + }, + { + "entropy": 0.8966798186302185, + "epoch": 1.6121951219512196, + "grad_norm": 0.5264739394187927, + "learning_rate": 2.1680216802168024e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.726248562335968, + "num_tokens": 9070341.0, + "step": 661 + }, + { + "entropy": 0.8982292413711548, + "epoch": 1.6146341463414635, + "grad_norm": 0.5185431838035583, + "learning_rate": 2.154471544715447e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7259421944618225, + "num_tokens": 9084022.0, + "step": 662 + }, + { + "entropy": 0.9089407920837402, + "epoch": 1.6170731707317074, + "grad_norm": 0.500324547290802, + "learning_rate": 2.1409214092140924e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7267783284187317, + "num_tokens": 9097829.0, + "step": 663 + }, + { + "entropy": 0.8799450397491455, + "epoch": 1.6195121951219513, + "grad_norm": 0.5376121401786804, + "learning_rate": 2.127371273712737e-06, + "loss": 0.861, + "mean_token_accuracy": 0.73411625623703, + "num_tokens": 9111129.0, + "step": 664 + }, + { + "entropy": 0.8976962566375732, + "epoch": 1.6219512195121952, + "grad_norm": 0.48157837986946106, + "learning_rate": 2.1138211382113824e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7249734997749329, + "num_tokens": 9125300.0, + "step": 665 + }, + { + "entropy": 0.8926389217376709, + "epoch": 1.6243902439024391, + "grad_norm": 0.5121070742607117, + "learning_rate": 2.100271002710027e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7253190279006958, + "num_tokens": 9138717.0, + "step": 666 + }, + { + "entropy": 0.8846919536590576, + "epoch": 1.626829268292683, + "grad_norm": 0.4743853807449341, + "learning_rate": 2.0867208672086723e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7336145639419556, + "num_tokens": 9152007.0, + "step": 667 + }, + { + "entropy": 0.8994081020355225, + "epoch": 1.629268292682927, + "grad_norm": 0.46882563829421997, + "learning_rate": 2.073170731707317e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7267160415649414, + "num_tokens": 9165251.0, + "step": 668 + }, + { + "entropy": 0.8839161396026611, + "epoch": 1.6317073170731708, + "grad_norm": 0.5257059335708618, + "learning_rate": 2.0596205962059623e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7352175712585449, + "num_tokens": 9178712.0, + "step": 669 + }, + { + "entropy": 0.8784297704696655, + "epoch": 1.6341463414634148, + "grad_norm": 0.519122838973999, + "learning_rate": 2.046070460704607e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7308939099311829, + "num_tokens": 9192611.0, + "step": 670 + }, + { + "entropy": 0.8900331258773804, + "epoch": 1.6365853658536587, + "grad_norm": 0.5005249977111816, + "learning_rate": 2.0325203252032523e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7258846163749695, + "num_tokens": 9206249.0, + "step": 671 + }, + { + "entropy": 0.8765691518783569, + "epoch": 1.6390243902439026, + "grad_norm": 0.5093433856964111, + "learning_rate": 2.018970189701897e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7366489171981812, + "num_tokens": 9219616.0, + "step": 672 + }, + { + "entropy": 0.873883843421936, + "epoch": 1.6414634146341465, + "grad_norm": 0.5480417609214783, + "learning_rate": 2.0054200542005423e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7316689491271973, + "num_tokens": 9233611.0, + "step": 673 + }, + { + "entropy": 0.8792591094970703, + "epoch": 1.6439024390243904, + "grad_norm": 0.5551111698150635, + "learning_rate": 1.991869918699187e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7298508286476135, + "num_tokens": 9247238.0, + "step": 674 + }, + { + "entropy": 0.8775755167007446, + "epoch": 1.6463414634146343, + "grad_norm": 0.5572032928466797, + "learning_rate": 1.9783197831978322e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7310796976089478, + "num_tokens": 9261128.0, + "step": 675 + }, + { + "entropy": 0.8921653032302856, + "epoch": 1.6487804878048782, + "grad_norm": 0.5530563592910767, + "learning_rate": 1.964769647696477e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7238720655441284, + "num_tokens": 9275152.0, + "step": 676 + }, + { + "entropy": 0.8693351745605469, + "epoch": 1.651219512195122, + "grad_norm": 0.5165068507194519, + "learning_rate": 1.951219512195122e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7313920855522156, + "num_tokens": 9288630.0, + "step": 677 + }, + { + "entropy": 0.8753268122673035, + "epoch": 1.653658536585366, + "grad_norm": 0.4899144470691681, + "learning_rate": 1.937669376693767e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7322003841400146, + "num_tokens": 9302298.0, + "step": 678 + }, + { + "entropy": 0.8960176706314087, + "epoch": 1.65609756097561, + "grad_norm": 0.5682179927825928, + "learning_rate": 1.924119241192412e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7215538620948792, + "num_tokens": 9316112.0, + "step": 679 + }, + { + "entropy": 0.875866174697876, + "epoch": 1.6585365853658538, + "grad_norm": 0.48258838057518005, + "learning_rate": 1.9105691056910574e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7342554330825806, + "num_tokens": 9329720.0, + "step": 680 + }, + { + "entropy": 0.8869270086288452, + "epoch": 1.6609756097560977, + "grad_norm": 0.5566992163658142, + "learning_rate": 1.8970189701897021e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7254682779312134, + "num_tokens": 9343137.0, + "step": 681 + }, + { + "entropy": 0.8802525997161865, + "epoch": 1.6634146341463416, + "grad_norm": 0.5332781076431274, + "learning_rate": 1.8834688346883471e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.733722984790802, + "num_tokens": 9356669.0, + "step": 682 + }, + { + "entropy": 0.8921730518341064, + "epoch": 1.6658536585365855, + "grad_norm": 0.5774820446968079, + "learning_rate": 1.8699186991869919e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7285693883895874, + "num_tokens": 9370637.0, + "step": 683 + }, + { + "entropy": 0.8870189785957336, + "epoch": 1.6682926829268294, + "grad_norm": 0.5511258840560913, + "learning_rate": 1.856368563685637e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.729683518409729, + "num_tokens": 9384681.0, + "step": 684 + }, + { + "entropy": 0.8922406435012817, + "epoch": 1.6707317073170733, + "grad_norm": 0.6266551613807678, + "learning_rate": 1.842818428184282e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7260046005249023, + "num_tokens": 9398533.0, + "step": 685 + }, + { + "entropy": 0.877333402633667, + "epoch": 1.6731707317073172, + "grad_norm": 0.5099342465400696, + "learning_rate": 1.8292682926829268e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7367627024650574, + "num_tokens": 9411807.0, + "step": 686 + }, + { + "entropy": 0.8942214250564575, + "epoch": 1.6756097560975611, + "grad_norm": 0.6306934356689453, + "learning_rate": 1.8157181571815718e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7317629456520081, + "num_tokens": 9425408.0, + "step": 687 + }, + { + "entropy": 0.892951250076294, + "epoch": 1.678048780487805, + "grad_norm": 0.5098548531532288, + "learning_rate": 1.802168021680217e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7273315191268921, + "num_tokens": 9439342.0, + "step": 688 + }, + { + "entropy": 0.8908568620681763, + "epoch": 1.680487804878049, + "grad_norm": 0.5078380703926086, + "learning_rate": 1.788617886178862e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7286223769187927, + "num_tokens": 9453099.0, + "step": 689 + }, + { + "entropy": 0.8855090737342834, + "epoch": 1.6829268292682928, + "grad_norm": 0.547004222869873, + "learning_rate": 1.7750677506775068e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7313100695610046, + "num_tokens": 9467053.0, + "step": 690 + }, + { + "entropy": 0.906796932220459, + "epoch": 1.6853658536585368, + "grad_norm": 0.5479020476341248, + "learning_rate": 1.7615176151761518e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7282412052154541, + "num_tokens": 9480868.0, + "step": 691 + }, + { + "entropy": 0.893460750579834, + "epoch": 1.6878048780487804, + "grad_norm": 0.5498223304748535, + "learning_rate": 1.747967479674797e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7282806038856506, + "num_tokens": 9495226.0, + "step": 692 + }, + { + "entropy": 0.88077712059021, + "epoch": 1.6902439024390243, + "grad_norm": 0.646657407283783, + "learning_rate": 1.734417344173442e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7274883389472961, + "num_tokens": 9508735.0, + "step": 693 + }, + { + "entropy": 0.8923465013504028, + "epoch": 1.6926829268292682, + "grad_norm": 0.6305309534072876, + "learning_rate": 1.7208672086720867e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7270920872688293, + "num_tokens": 9522338.0, + "step": 694 + }, + { + "entropy": 0.8856834173202515, + "epoch": 1.6951219512195121, + "grad_norm": 0.5028641819953918, + "learning_rate": 1.707317073170732e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7312750220298767, + "num_tokens": 9536346.0, + "step": 695 + }, + { + "entropy": 0.8918694257736206, + "epoch": 1.697560975609756, + "grad_norm": 0.5215321183204651, + "learning_rate": 1.693766937669377e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7307663559913635, + "num_tokens": 9549867.0, + "step": 696 + }, + { + "entropy": 0.891266942024231, + "epoch": 1.7, + "grad_norm": 0.5111386775970459, + "learning_rate": 1.6802168021680219e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7272476553916931, + "num_tokens": 9564398.0, + "step": 697 + }, + { + "entropy": 0.861628532409668, + "epoch": 1.7024390243902439, + "grad_norm": 0.5218578577041626, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7348657846450806, + "num_tokens": 9578422.0, + "step": 698 + }, + { + "entropy": 0.8941474556922913, + "epoch": 1.7048780487804878, + "grad_norm": 0.5603755712509155, + "learning_rate": 1.6531165311653119e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7347380518913269, + "num_tokens": 9592296.0, + "step": 699 + }, + { + "entropy": 0.8734927177429199, + "epoch": 1.7073170731707317, + "grad_norm": 0.49485158920288086, + "learning_rate": 1.6395663956639568e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.734604001045227, + "num_tokens": 9606163.0, + "step": 700 + }, + { + "entropy": 0.8836231827735901, + "epoch": 1.7097560975609756, + "grad_norm": 0.5523728728294373, + "learning_rate": 1.6260162601626018e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7232486009597778, + "num_tokens": 9619711.0, + "step": 701 + }, + { + "entropy": 0.8645951747894287, + "epoch": 1.7121951219512195, + "grad_norm": 0.5394639372825623, + "learning_rate": 1.6124661246612466e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7320195436477661, + "num_tokens": 9633631.0, + "step": 702 + }, + { + "entropy": 0.8609342575073242, + "epoch": 1.7146341463414634, + "grad_norm": 0.5509612560272217, + "learning_rate": 1.5989159891598918e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7359808087348938, + "num_tokens": 9647396.0, + "step": 703 + }, + { + "entropy": 0.8949425220489502, + "epoch": 1.7170731707317073, + "grad_norm": 0.5197579860687256, + "learning_rate": 1.5853658536585368e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7241796851158142, + "num_tokens": 9661461.0, + "step": 704 + }, + { + "entropy": 0.8748670816421509, + "epoch": 1.7195121951219512, + "grad_norm": 0.5404813289642334, + "learning_rate": 1.5718157181571816e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7258481979370117, + "num_tokens": 9675389.0, + "step": 705 + }, + { + "entropy": 0.8763769865036011, + "epoch": 1.721951219512195, + "grad_norm": 0.5087519288063049, + "learning_rate": 1.5582655826558265e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7309332489967346, + "num_tokens": 9689238.0, + "step": 706 + }, + { + "entropy": 0.873427152633667, + "epoch": 1.724390243902439, + "grad_norm": 0.5565934181213379, + "learning_rate": 1.5447154471544717e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7338494062423706, + "num_tokens": 9703077.0, + "step": 707 + }, + { + "entropy": 0.874050498008728, + "epoch": 1.726829268292683, + "grad_norm": 0.5677209496498108, + "learning_rate": 1.5311653116531167e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7351242303848267, + "num_tokens": 9716941.0, + "step": 708 + }, + { + "entropy": 0.8907315731048584, + "epoch": 1.7292682926829268, + "grad_norm": 0.560185432434082, + "learning_rate": 1.5176151761517615e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7275949120521545, + "num_tokens": 9730782.0, + "step": 709 + }, + { + "entropy": 0.8864357471466064, + "epoch": 1.7317073170731707, + "grad_norm": 0.5551504492759705, + "learning_rate": 1.5040650406504067e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7227503061294556, + "num_tokens": 9745211.0, + "step": 710 + }, + { + "entropy": 0.8754174709320068, + "epoch": 1.7341463414634146, + "grad_norm": 0.4636332094669342, + "learning_rate": 1.4905149051490517e-06, + "loss": 0.852, + "mean_token_accuracy": 0.735327422618866, + "num_tokens": 9759369.0, + "step": 711 + }, + { + "entropy": 0.8752076625823975, + "epoch": 1.7365853658536585, + "grad_norm": 0.5738482475280762, + "learning_rate": 1.4769647696476967e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7349178791046143, + "num_tokens": 9773143.0, + "step": 712 + }, + { + "entropy": 0.8808815479278564, + "epoch": 1.7390243902439024, + "grad_norm": 0.4846593141555786, + "learning_rate": 1.4634146341463414e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7282733917236328, + "num_tokens": 9787059.0, + "step": 713 + }, + { + "entropy": 0.8802102208137512, + "epoch": 1.7414634146341463, + "grad_norm": 0.5256415605545044, + "learning_rate": 1.4498644986449866e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7356405258178711, + "num_tokens": 9800829.0, + "step": 714 + }, + { + "entropy": 0.8865002393722534, + "epoch": 1.7439024390243902, + "grad_norm": 0.6085943579673767, + "learning_rate": 1.4363143631436316e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7306849360466003, + "num_tokens": 9814992.0, + "step": 715 + }, + { + "entropy": 0.8770885467529297, + "epoch": 1.7463414634146341, + "grad_norm": 0.5122337341308594, + "learning_rate": 1.4227642276422766e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7321338057518005, + "num_tokens": 9828847.0, + "step": 716 + }, + { + "entropy": 0.8982193470001221, + "epoch": 1.748780487804878, + "grad_norm": 0.5098617672920227, + "learning_rate": 1.4092140921409214e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7284758687019348, + "num_tokens": 9842766.0, + "step": 717 + }, + { + "entropy": 0.8702014088630676, + "epoch": 1.751219512195122, + "grad_norm": 0.5830314755439758, + "learning_rate": 1.3956639566395666e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7296539545059204, + "num_tokens": 9856335.0, + "step": 718 + }, + { + "entropy": 0.8794569969177246, + "epoch": 1.7536585365853659, + "grad_norm": 0.5575631260871887, + "learning_rate": 1.3821138211382116e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.734868049621582, + "num_tokens": 9870295.0, + "step": 719 + }, + { + "entropy": 0.8718886375427246, + "epoch": 1.7560975609756098, + "grad_norm": 0.7530406713485718, + "learning_rate": 1.3685636856368563e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7317982912063599, + "num_tokens": 9884252.0, + "step": 720 + }, + { + "entropy": 0.8773154020309448, + "epoch": 1.7585365853658537, + "grad_norm": 0.5159140825271606, + "learning_rate": 1.3550135501355013e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7374392151832581, + "num_tokens": 9897842.0, + "step": 721 + }, + { + "entropy": 0.8738135099411011, + "epoch": 1.7609756097560976, + "grad_norm": 0.5728605389595032, + "learning_rate": 1.3414634146341465e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7306110262870789, + "num_tokens": 9910984.0, + "step": 722 + }, + { + "entropy": 0.8791817426681519, + "epoch": 1.7634146341463415, + "grad_norm": 0.5503786206245422, + "learning_rate": 1.3279132791327915e-06, + "loss": 0.871, + "mean_token_accuracy": 0.72954261302948, + "num_tokens": 9924577.0, + "step": 723 + }, + { + "entropy": 0.8843265771865845, + "epoch": 1.7658536585365854, + "grad_norm": 0.5542715787887573, + "learning_rate": 1.3143631436314363e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7308476567268372, + "num_tokens": 9938325.0, + "step": 724 + }, + { + "entropy": 0.865561306476593, + "epoch": 1.7682926829268293, + "grad_norm": 0.5139968991279602, + "learning_rate": 1.3008130081300815e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7372664213180542, + "num_tokens": 9951613.0, + "step": 725 + }, + { + "entropy": 0.8823000192642212, + "epoch": 1.7707317073170732, + "grad_norm": 0.5414228439331055, + "learning_rate": 1.2872628726287264e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7271595001220703, + "num_tokens": 9965278.0, + "step": 726 + }, + { + "entropy": 0.8546561598777771, + "epoch": 1.773170731707317, + "grad_norm": 0.6240255832672119, + "learning_rate": 1.2737127371273714e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7430485486984253, + "num_tokens": 9979176.0, + "step": 727 + }, + { + "entropy": 0.8734099268913269, + "epoch": 1.775609756097561, + "grad_norm": 0.5208917856216431, + "learning_rate": 1.2601626016260162e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7244963645935059, + "num_tokens": 9993141.0, + "step": 728 + }, + { + "entropy": 0.8741912841796875, + "epoch": 1.778048780487805, + "grad_norm": 0.5388087630271912, + "learning_rate": 1.2466124661246614e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7358125448226929, + "num_tokens": 10006602.0, + "step": 729 + }, + { + "entropy": 0.8812603950500488, + "epoch": 1.7804878048780488, + "grad_norm": 0.5764947533607483, + "learning_rate": 1.2330623306233064e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.727005124092102, + "num_tokens": 10020208.0, + "step": 730 + }, + { + "entropy": 0.883576512336731, + "epoch": 1.7829268292682927, + "grad_norm": 0.562300443649292, + "learning_rate": 1.2195121951219514e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7265163064002991, + "num_tokens": 10033925.0, + "step": 731 + }, + { + "entropy": 0.864028811454773, + "epoch": 1.7853658536585366, + "grad_norm": 0.5029560327529907, + "learning_rate": 1.2059620596205964e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7317820191383362, + "num_tokens": 10047650.0, + "step": 732 + }, + { + "entropy": 0.8827657699584961, + "epoch": 1.7878048780487805, + "grad_norm": 0.5882710218429565, + "learning_rate": 1.1924119241192413e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7273064255714417, + "num_tokens": 10061150.0, + "step": 733 + }, + { + "entropy": 0.8744525909423828, + "epoch": 1.7902439024390244, + "grad_norm": 0.5701075792312622, + "learning_rate": 1.1788617886178863e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7322956919670105, + "num_tokens": 10074722.0, + "step": 734 + }, + { + "entropy": 0.8803761005401611, + "epoch": 1.7926829268292683, + "grad_norm": 0.5309422612190247, + "learning_rate": 1.1653116531165313e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7303227782249451, + "num_tokens": 10088091.0, + "step": 735 + }, + { + "entropy": 0.8677628040313721, + "epoch": 1.7951219512195122, + "grad_norm": 0.5347625613212585, + "learning_rate": 1.1517615176151763e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7351138591766357, + "num_tokens": 10101811.0, + "step": 736 + }, + { + "entropy": 0.8758652806282043, + "epoch": 1.7975609756097561, + "grad_norm": 0.5840176343917847, + "learning_rate": 1.1382113821138213e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7307803630828857, + "num_tokens": 10115667.0, + "step": 737 + }, + { + "entropy": 0.8817602396011353, + "epoch": 1.8, + "grad_norm": 0.574613630771637, + "learning_rate": 1.1246612466124663e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7227194309234619, + "num_tokens": 10128980.0, + "step": 738 + }, + { + "entropy": 0.8836100697517395, + "epoch": 1.802439024390244, + "grad_norm": 0.5252755880355835, + "learning_rate": 1.111111111111111e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7268431186676025, + "num_tokens": 10142750.0, + "step": 739 + }, + { + "entropy": 0.8818948268890381, + "epoch": 1.8048780487804879, + "grad_norm": 0.6356107592582703, + "learning_rate": 1.0975609756097562e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7285158038139343, + "num_tokens": 10155857.0, + "step": 740 + }, + { + "entropy": 0.883324384689331, + "epoch": 1.8073170731707318, + "grad_norm": 0.6597152352333069, + "learning_rate": 1.0840108401084012e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7296566963195801, + "num_tokens": 10169563.0, + "step": 741 + }, + { + "entropy": 0.8747758269309998, + "epoch": 1.8097560975609757, + "grad_norm": 0.5430500507354736, + "learning_rate": 1.0704607046070462e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7310535311698914, + "num_tokens": 10182972.0, + "step": 742 + }, + { + "entropy": 0.8945927023887634, + "epoch": 1.8121951219512196, + "grad_norm": 0.5791915655136108, + "learning_rate": 1.0569105691056912e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7228506207466125, + "num_tokens": 10196515.0, + "step": 743 + }, + { + "entropy": 0.889089047908783, + "epoch": 1.8146341463414632, + "grad_norm": 0.5456008315086365, + "learning_rate": 1.0433604336043362e-06, + "loss": 0.877, + "mean_token_accuracy": 0.728965163230896, + "num_tokens": 10210282.0, + "step": 744 + }, + { + "entropy": 0.8740544319152832, + "epoch": 1.8170731707317072, + "grad_norm": 0.660473644733429, + "learning_rate": 1.0298102981029812e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7304914593696594, + "num_tokens": 10224420.0, + "step": 745 + }, + { + "entropy": 0.8994358777999878, + "epoch": 1.819512195121951, + "grad_norm": 0.5640230774879456, + "learning_rate": 1.0162601626016261e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7228577733039856, + "num_tokens": 10238335.0, + "step": 746 + }, + { + "entropy": 0.8852430582046509, + "epoch": 1.821951219512195, + "grad_norm": 0.5081997513771057, + "learning_rate": 1.0027100271002711e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7313060164451599, + "num_tokens": 10251965.0, + "step": 747 + }, + { + "entropy": 0.8647916316986084, + "epoch": 1.8243902439024389, + "grad_norm": 0.5371124744415283, + "learning_rate": 9.891598915989161e-07, + "loss": 0.8464, + "mean_token_accuracy": 0.7346433401107788, + "num_tokens": 10265314.0, + "step": 748 + }, + { + "entropy": 0.8955444097518921, + "epoch": 1.8268292682926828, + "grad_norm": 0.5369889736175537, + "learning_rate": 9.75609756097561e-07, + "loss": 0.8706, + "mean_token_accuracy": 0.731004536151886, + "num_tokens": 10279386.0, + "step": 749 + }, + { + "entropy": 0.8667882680892944, + "epoch": 1.8292682926829267, + "grad_norm": 0.5496275424957275, + "learning_rate": 9.62059620596206e-07, + "loss": 0.8492, + "mean_token_accuracy": 0.7305599451065063, + "num_tokens": 10293368.0, + "step": 750 + }, + { + "entropy": 0.8811326026916504, + "epoch": 1.8317073170731706, + "grad_norm": 0.5627800822257996, + "learning_rate": 9.485094850948511e-07, + "loss": 0.85, + "mean_token_accuracy": 0.7355487942695618, + "num_tokens": 10306895.0, + "step": 751 + }, + { + "entropy": 0.8759801387786865, + "epoch": 1.8341463414634145, + "grad_norm": 0.5209669470787048, + "learning_rate": 9.349593495934959e-07, + "loss": 0.8723, + "mean_token_accuracy": 0.7327739596366882, + "num_tokens": 10320379.0, + "step": 752 + }, + { + "entropy": 0.9030628204345703, + "epoch": 1.8365853658536584, + "grad_norm": 0.5620582699775696, + "learning_rate": 9.21409214092141e-07, + "loss": 0.853, + "mean_token_accuracy": 0.7353246212005615, + "num_tokens": 10333887.0, + "step": 753 + }, + { + "entropy": 0.8929505348205566, + "epoch": 1.8390243902439023, + "grad_norm": 0.5281407833099365, + "learning_rate": 9.078590785907859e-07, + "loss": 0.8845, + "mean_token_accuracy": 0.7236402630805969, + "num_tokens": 10347968.0, + "step": 754 + }, + { + "entropy": 0.8875263929367065, + "epoch": 1.8414634146341462, + "grad_norm": 0.5603488087654114, + "learning_rate": 8.94308943089431e-07, + "loss": 0.8783, + "mean_token_accuracy": 0.730212390422821, + "num_tokens": 10361591.0, + "step": 755 + }, + { + "entropy": 0.8743438720703125, + "epoch": 1.84390243902439, + "grad_norm": 0.515800416469574, + "learning_rate": 8.807588075880759e-07, + "loss": 0.8518, + "mean_token_accuracy": 0.7341781258583069, + "num_tokens": 10375180.0, + "step": 756 + }, + { + "entropy": 0.8860392570495605, + "epoch": 1.846341463414634, + "grad_norm": 0.5203832387924194, + "learning_rate": 8.67208672086721e-07, + "loss": 0.8708, + "mean_token_accuracy": 0.729315459728241, + "num_tokens": 10389615.0, + "step": 757 + }, + { + "entropy": 0.9075171947479248, + "epoch": 1.848780487804878, + "grad_norm": 0.5129781365394592, + "learning_rate": 8.53658536585366e-07, + "loss": 0.8972, + "mean_token_accuracy": 0.723143458366394, + "num_tokens": 10403501.0, + "step": 758 + }, + { + "entropy": 0.8773412108421326, + "epoch": 1.8512195121951218, + "grad_norm": 0.5214099884033203, + "learning_rate": 8.401084010840109e-07, + "loss": 0.8701, + "mean_token_accuracy": 0.7358790040016174, + "num_tokens": 10416742.0, + "step": 759 + }, + { + "entropy": 0.9007905721664429, + "epoch": 1.8536585365853657, + "grad_norm": 0.5532057285308838, + "learning_rate": 8.265582655826559e-07, + "loss": 0.885, + "mean_token_accuracy": 0.727091908454895, + "num_tokens": 10430334.0, + "step": 760 + }, + { + "entropy": 0.875899076461792, + "epoch": 1.8560975609756096, + "grad_norm": 0.4984937608242035, + "learning_rate": 8.130081300813009e-07, + "loss": 0.8598, + "mean_token_accuracy": 0.7356676459312439, + "num_tokens": 10444287.0, + "step": 761 + }, + { + "entropy": 0.879464328289032, + "epoch": 1.8585365853658535, + "grad_norm": 0.5563964247703552, + "learning_rate": 7.994579945799459e-07, + "loss": 0.8753, + "mean_token_accuracy": 0.7309806942939758, + "num_tokens": 10457763.0, + "step": 762 + }, + { + "entropy": 0.8621103167533875, + "epoch": 1.8609756097560974, + "grad_norm": 0.614388644695282, + "learning_rate": 7.859078590785908e-07, + "loss": 0.8607, + "mean_token_accuracy": 0.7339669466018677, + "num_tokens": 10471563.0, + "step": 763 + }, + { + "entropy": 0.8521906137466431, + "epoch": 1.8634146341463413, + "grad_norm": 0.6084858775138855, + "learning_rate": 7.723577235772359e-07, + "loss": 0.8401, + "mean_token_accuracy": 0.7348809242248535, + "num_tokens": 10484890.0, + "step": 764 + }, + { + "entropy": 0.8763301372528076, + "epoch": 1.8658536585365852, + "grad_norm": 0.5668750405311584, + "learning_rate": 7.588075880758807e-07, + "loss": 0.8895, + "mean_token_accuracy": 0.7239844799041748, + "num_tokens": 10499061.0, + "step": 765 + }, + { + "entropy": 0.8991358280181885, + "epoch": 1.8682926829268292, + "grad_norm": 0.48762112855911255, + "learning_rate": 7.452574525745258e-07, + "loss": 0.8837, + "mean_token_accuracy": 0.7247573733329773, + "num_tokens": 10512883.0, + "step": 766 + }, + { + "entropy": 0.8535343408584595, + "epoch": 1.870731707317073, + "grad_norm": 0.5635075569152832, + "learning_rate": 7.317073170731707e-07, + "loss": 0.8385, + "mean_token_accuracy": 0.7378661632537842, + "num_tokens": 10526930.0, + "step": 767 + }, + { + "entropy": 0.8730718493461609, + "epoch": 1.873170731707317, + "grad_norm": 0.5588259696960449, + "learning_rate": 7.181571815718158e-07, + "loss": 0.859, + "mean_token_accuracy": 0.734580397605896, + "num_tokens": 10540792.0, + "step": 768 + }, + { + "entropy": 0.863853931427002, + "epoch": 1.8756097560975609, + "grad_norm": 0.5849318504333496, + "learning_rate": 7.046070460704607e-07, + "loss": 0.8562, + "mean_token_accuracy": 0.7336525321006775, + "num_tokens": 10554174.0, + "step": 769 + }, + { + "entropy": 0.8770521283149719, + "epoch": 1.8780487804878048, + "grad_norm": 0.5464377999305725, + "learning_rate": 6.910569105691058e-07, + "loss": 0.8543, + "mean_token_accuracy": 0.7362645864486694, + "num_tokens": 10567495.0, + "step": 770 + }, + { + "entropy": 0.887363076210022, + "epoch": 1.8804878048780487, + "grad_norm": 0.5217662453651428, + "learning_rate": 6.775067750677507e-07, + "loss": 0.8955, + "mean_token_accuracy": 0.7222620844841003, + "num_tokens": 10581445.0, + "step": 771 + }, + { + "entropy": 0.8654752969741821, + "epoch": 1.8829268292682926, + "grad_norm": 0.5389561653137207, + "learning_rate": 6.639566395663957e-07, + "loss": 0.8468, + "mean_token_accuracy": 0.7368183732032776, + "num_tokens": 10594756.0, + "step": 772 + }, + { + "entropy": 0.9031565189361572, + "epoch": 1.8853658536585365, + "grad_norm": 0.5442070960998535, + "learning_rate": 6.504065040650407e-07, + "loss": 0.9152, + "mean_token_accuracy": 0.7197484970092773, + "num_tokens": 10608449.0, + "step": 773 + }, + { + "entropy": 0.8798193335533142, + "epoch": 1.8878048780487804, + "grad_norm": 0.5139967203140259, + "learning_rate": 6.368563685636857e-07, + "loss": 0.8465, + "mean_token_accuracy": 0.7399256825447083, + "num_tokens": 10622188.0, + "step": 774 + }, + { + "entropy": 0.8660321235656738, + "epoch": 1.8902439024390243, + "grad_norm": 0.5604297518730164, + "learning_rate": 6.233062330623307e-07, + "loss": 0.8621, + "mean_token_accuracy": 0.7294635772705078, + "num_tokens": 10635607.0, + "step": 775 + }, + { + "entropy": 0.8818001747131348, + "epoch": 1.8926829268292682, + "grad_norm": 0.5093931555747986, + "learning_rate": 6.097560975609757e-07, + "loss": 0.8791, + "mean_token_accuracy": 0.7306883335113525, + "num_tokens": 10649410.0, + "step": 776 + }, + { + "entropy": 0.879349946975708, + "epoch": 1.895121951219512, + "grad_norm": 0.5525664687156677, + "learning_rate": 5.962059620596207e-07, + "loss": 0.859, + "mean_token_accuracy": 0.7347868084907532, + "num_tokens": 10662819.0, + "step": 777 + }, + { + "entropy": 0.8722898960113525, + "epoch": 1.897560975609756, + "grad_norm": 0.5286668539047241, + "learning_rate": 5.826558265582657e-07, + "loss": 0.8698, + "mean_token_accuracy": 0.7305213809013367, + "num_tokens": 10676491.0, + "step": 778 + }, + { + "entropy": 0.8877685070037842, + "epoch": 1.9, + "grad_norm": 0.531743586063385, + "learning_rate": 5.691056910569106e-07, + "loss": 0.8699, + "mean_token_accuracy": 0.7307401299476624, + "num_tokens": 10689721.0, + "step": 779 + }, + { + "entropy": 0.8780038356781006, + "epoch": 1.9024390243902438, + "grad_norm": 0.5183002948760986, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8738, + "mean_token_accuracy": 0.72763991355896, + "num_tokens": 10703601.0, + "step": 780 + }, + { + "entropy": 0.8894438743591309, + "epoch": 1.9048780487804877, + "grad_norm": 0.4965713918209076, + "learning_rate": 5.420054200542006e-07, + "loss": 0.8886, + "mean_token_accuracy": 0.7262105941772461, + "num_tokens": 10717226.0, + "step": 781 + }, + { + "entropy": 0.8759099245071411, + "epoch": 1.9073170731707316, + "grad_norm": 0.504848301410675, + "learning_rate": 5.284552845528456e-07, + "loss": 0.871, + "mean_token_accuracy": 0.7337351441383362, + "num_tokens": 10731014.0, + "step": 782 + }, + { + "entropy": 0.8838939666748047, + "epoch": 1.9097560975609755, + "grad_norm": 0.5675578713417053, + "learning_rate": 5.149051490514906e-07, + "loss": 0.873, + "mean_token_accuracy": 0.731378436088562, + "num_tokens": 10745046.0, + "step": 783 + }, + { + "entropy": 0.8941045999526978, + "epoch": 1.9121951219512194, + "grad_norm": 0.609192967414856, + "learning_rate": 5.013550135501356e-07, + "loss": 0.895, + "mean_token_accuracy": 0.7223051190376282, + "num_tokens": 10759135.0, + "step": 784 + }, + { + "entropy": 0.8828786611557007, + "epoch": 1.9146341463414633, + "grad_norm": 0.5097105503082275, + "learning_rate": 4.878048780487805e-07, + "loss": 0.8647, + "mean_token_accuracy": 0.7296620607376099, + "num_tokens": 10772734.0, + "step": 785 + }, + { + "entropy": 0.8686960339546204, + "epoch": 1.9170731707317072, + "grad_norm": 0.5239219665527344, + "learning_rate": 4.7425474254742553e-07, + "loss": 0.8578, + "mean_token_accuracy": 0.7318073511123657, + "num_tokens": 10786162.0, + "step": 786 + }, + { + "entropy": 0.8628572225570679, + "epoch": 1.9195121951219511, + "grad_norm": 0.5133153200149536, + "learning_rate": 4.607046070460705e-07, + "loss": 0.835, + "mean_token_accuracy": 0.7363176941871643, + "num_tokens": 10799827.0, + "step": 787 + }, + { + "entropy": 0.8920273780822754, + "epoch": 1.921951219512195, + "grad_norm": 0.5285261869430542, + "learning_rate": 4.471544715447155e-07, + "loss": 0.8851, + "mean_token_accuracy": 0.7281093001365662, + "num_tokens": 10813753.0, + "step": 788 + }, + { + "entropy": 0.8801295757293701, + "epoch": 1.924390243902439, + "grad_norm": 0.5520910620689392, + "learning_rate": 4.336043360433605e-07, + "loss": 0.8872, + "mean_token_accuracy": 0.7253822088241577, + "num_tokens": 10827570.0, + "step": 789 + }, + { + "entropy": 0.8774034380912781, + "epoch": 1.9268292682926829, + "grad_norm": 0.5181596279144287, + "learning_rate": 4.2005420054200547e-07, + "loss": 0.8544, + "mean_token_accuracy": 0.7363961935043335, + "num_tokens": 10841277.0, + "step": 790 + }, + { + "entropy": 0.8727174997329712, + "epoch": 1.9292682926829268, + "grad_norm": 0.5676617622375488, + "learning_rate": 4.0650406504065046e-07, + "loss": 0.8475, + "mean_token_accuracy": 0.7332050800323486, + "num_tokens": 10854809.0, + "step": 791 + }, + { + "entropy": 0.8603472113609314, + "epoch": 1.9317073170731707, + "grad_norm": 0.5061153173446655, + "learning_rate": 3.929539295392954e-07, + "loss": 0.8337, + "mean_token_accuracy": 0.7372038960456848, + "num_tokens": 10868208.0, + "step": 792 + }, + { + "entropy": 0.8648924827575684, + "epoch": 1.9341463414634146, + "grad_norm": 0.516345202922821, + "learning_rate": 3.794037940379404e-07, + "loss": 0.8779, + "mean_token_accuracy": 0.7320489287376404, + "num_tokens": 10882290.0, + "step": 793 + }, + { + "entropy": 0.887532651424408, + "epoch": 1.9365853658536585, + "grad_norm": 0.5096654295921326, + "learning_rate": 3.6585365853658536e-07, + "loss": 0.8717, + "mean_token_accuracy": 0.7271854877471924, + "num_tokens": 10895850.0, + "step": 794 + }, + { + "entropy": 0.8896082639694214, + "epoch": 1.9390243902439024, + "grad_norm": 0.5067712664604187, + "learning_rate": 3.5230352303523034e-07, + "loss": 0.8612, + "mean_token_accuracy": 0.7332432866096497, + "num_tokens": 10909935.0, + "step": 795 + }, + { + "entropy": 0.8808606863021851, + "epoch": 1.9414634146341463, + "grad_norm": 0.5321662425994873, + "learning_rate": 3.3875338753387533e-07, + "loss": 0.8754, + "mean_token_accuracy": 0.7316710352897644, + "num_tokens": 10923386.0, + "step": 796 + }, + { + "entropy": 0.882371187210083, + "epoch": 1.9439024390243902, + "grad_norm": 0.6009424924850464, + "learning_rate": 3.2520325203252037e-07, + "loss": 0.8694, + "mean_token_accuracy": 0.7282980680465698, + "num_tokens": 10936258.0, + "step": 797 + }, + { + "entropy": 0.8814548254013062, + "epoch": 1.946341463414634, + "grad_norm": 0.533541202545166, + "learning_rate": 3.1165311653116535e-07, + "loss": 0.8671, + "mean_token_accuracy": 0.732546865940094, + "num_tokens": 10949667.0, + "step": 798 + }, + { + "entropy": 0.8886417150497437, + "epoch": 1.948780487804878, + "grad_norm": 0.540417492389679, + "learning_rate": 2.9810298102981034e-07, + "loss": 0.8919, + "mean_token_accuracy": 0.7254874110221863, + "num_tokens": 10963890.0, + "step": 799 + }, + { + "entropy": 0.8837313652038574, + "epoch": 1.951219512195122, + "grad_norm": 0.5418463349342346, + "learning_rate": 2.845528455284553e-07, + "loss": 0.8618, + "mean_token_accuracy": 0.7310764193534851, + "num_tokens": 10977553.0, + "step": 800 + }, + { + "entropy": 0.8748241662979126, + "epoch": 1.9536585365853658, + "grad_norm": 0.49068158864974976, + "learning_rate": 2.710027100271003e-07, + "loss": 0.8724, + "mean_token_accuracy": 0.7294101119041443, + "num_tokens": 10991605.0, + "step": 801 + }, + { + "entropy": 0.8987839818000793, + "epoch": 1.9560975609756097, + "grad_norm": 0.5240376591682434, + "learning_rate": 2.574525745257453e-07, + "loss": 0.897, + "mean_token_accuracy": 0.7241867184638977, + "num_tokens": 11005054.0, + "step": 802 + }, + { + "entropy": 0.8850489258766174, + "epoch": 1.9585365853658536, + "grad_norm": 0.5221152305603027, + "learning_rate": 2.439024390243903e-07, + "loss": 0.8724, + "mean_token_accuracy": 0.7294400930404663, + "num_tokens": 11018786.0, + "step": 803 + }, + { + "entropy": 0.86225825548172, + "epoch": 1.9609756097560975, + "grad_norm": 0.5298704504966736, + "learning_rate": 2.3035230352303526e-07, + "loss": 0.8594, + "mean_token_accuracy": 0.7317895889282227, + "num_tokens": 11032146.0, + "step": 804 + }, + { + "entropy": 0.8769630193710327, + "epoch": 1.9634146341463414, + "grad_norm": 0.5109516382217407, + "learning_rate": 2.1680216802168024e-07, + "loss": 0.8593, + "mean_token_accuracy": 0.7320917844772339, + "num_tokens": 11045801.0, + "step": 805 + }, + { + "entropy": 0.8887900114059448, + "epoch": 1.9658536585365853, + "grad_norm": 0.4967120289802551, + "learning_rate": 2.0325203252032523e-07, + "loss": 0.8728, + "mean_token_accuracy": 0.733670175075531, + "num_tokens": 11059672.0, + "step": 806 + }, + { + "entropy": 0.8606371879577637, + "epoch": 1.9682926829268292, + "grad_norm": 0.532614529132843, + "learning_rate": 1.897018970189702e-07, + "loss": 0.8465, + "mean_token_accuracy": 0.736371636390686, + "num_tokens": 11073336.0, + "step": 807 + }, + { + "entropy": 0.8847682476043701, + "epoch": 1.9707317073170731, + "grad_norm": 0.5335534811019897, + "learning_rate": 1.7615176151761517e-07, + "loss": 0.8846, + "mean_token_accuracy": 0.7342714667320251, + "num_tokens": 11086783.0, + "step": 808 + }, + { + "entropy": 0.8636947870254517, + "epoch": 1.973170731707317, + "grad_norm": 0.5089491009712219, + "learning_rate": 1.6260162601626018e-07, + "loss": 0.8487, + "mean_token_accuracy": 0.738890528678894, + "num_tokens": 11100571.0, + "step": 809 + }, + { + "entropy": 0.8890416622161865, + "epoch": 1.975609756097561, + "grad_norm": 0.5388813018798828, + "learning_rate": 1.4905149051490517e-07, + "loss": 0.8864, + "mean_token_accuracy": 0.7321293354034424, + "num_tokens": 11113765.0, + "step": 810 + }, + { + "entropy": 0.9044409394264221, + "epoch": 1.9780487804878049, + "grad_norm": 0.5027598738670349, + "learning_rate": 1.3550135501355015e-07, + "loss": 0.8914, + "mean_token_accuracy": 0.7259988784790039, + "num_tokens": 11127697.0, + "step": 811 + }, + { + "entropy": 0.8642138242721558, + "epoch": 1.9804878048780488, + "grad_norm": 0.5696573853492737, + "learning_rate": 1.2195121951219514e-07, + "loss": 0.8505, + "mean_token_accuracy": 0.7409389019012451, + "num_tokens": 11140984.0, + "step": 812 + }, + { + "entropy": 0.8789657354354858, + "epoch": 1.9829268292682927, + "grad_norm": 0.5053057670593262, + "learning_rate": 1.0840108401084012e-07, + "loss": 0.8638, + "mean_token_accuracy": 0.735129714012146, + "num_tokens": 11154954.0, + "step": 813 + }, + { + "entropy": 0.8782299160957336, + "epoch": 1.9853658536585366, + "grad_norm": 0.5316551327705383, + "learning_rate": 9.48509485094851e-08, + "loss": 0.8749, + "mean_token_accuracy": 0.7280279994010925, + "num_tokens": 11168692.0, + "step": 814 + }, + { + "entropy": 0.8809655904769897, + "epoch": 1.9878048780487805, + "grad_norm": 0.5319744944572449, + "learning_rate": 8.130081300813009e-08, + "loss": 0.8773, + "mean_token_accuracy": 0.7296779751777649, + "num_tokens": 11182806.0, + "step": 815 + }, + { + "entropy": 0.874808132648468, + "epoch": 1.9902439024390244, + "grad_norm": 0.5373929738998413, + "learning_rate": 6.775067750677508e-08, + "loss": 0.8537, + "mean_token_accuracy": 0.7337278127670288, + "num_tokens": 11196680.0, + "step": 816 + }, + { + "entropy": 0.8780685663223267, + "epoch": 1.9926829268292683, + "grad_norm": 0.5213566422462463, + "learning_rate": 5.420054200542006e-08, + "loss": 0.86, + "mean_token_accuracy": 0.734074056148529, + "num_tokens": 11210196.0, + "step": 817 + }, + { + "entropy": 0.8767932057380676, + "epoch": 1.9951219512195122, + "grad_norm": 0.5122414827346802, + "learning_rate": 4.0650406504065046e-08, + "loss": 0.8454, + "mean_token_accuracy": 0.7340317368507385, + "num_tokens": 11223958.0, + "step": 818 + }, + { + "entropy": 0.8675880432128906, + "epoch": 1.997560975609756, + "grad_norm": 0.5748156309127808, + "learning_rate": 2.710027100271003e-08, + "loss": 0.8494, + "mean_token_accuracy": 0.7352919578552246, + "num_tokens": 11237453.0, + "step": 819 + }, + { + "entropy": 0.8685159683227539, + "epoch": 2.0, + "grad_norm": 0.535193145275116, + "learning_rate": 1.3550135501355015e-08, + "loss": 0.861, + "mean_token_accuracy": 0.7290802597999573, + "num_tokens": 11251451.0, + "step": 820 + }, + { + "epoch": 2.0, + "step": 820, + "total_flos": 5.656792731347845e+17, + "train_loss": 1.044341852534108, + "train_runtime": 2841.2294, + "train_samples_per_second": 4.611, + "train_steps_per_second": 0.289 + } + ], + "logging_steps": 1, + "max_steps": 820, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.656792731347845e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}