{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 1.8181818181818183e-06, "loss": 8.9259, "step": 1 }, { "epoch": 0.08, "learning_rate": 3.6363636363636366e-06, "loss": 8.9458, "step": 2 }, { "epoch": 0.12, "learning_rate": 5.4545454545454545e-06, "loss": 8.1374, "step": 3 }, { "epoch": 0.17, "learning_rate": 7.272727272727273e-06, "loss": 5.8659, "step": 4 }, { "epoch": 0.21, "learning_rate": 9.090909090909091e-06, "loss": 2.6947, "step": 5 }, { "epoch": 0.25, "learning_rate": 1.0909090909090909e-05, "loss": 1.8902, "step": 6 }, { "epoch": 0.29, "learning_rate": 1.2727272727272728e-05, "loss": 1.907, "step": 7 }, { "epoch": 0.33, "learning_rate": 1.4545454545454546e-05, "loss": 1.6722, "step": 8 }, { "epoch": 0.38, "learning_rate": 1.6363636363636366e-05, "loss": 1.6213, "step": 9 }, { "epoch": 0.42, "learning_rate": 1.8181818181818182e-05, "loss": 1.5757, "step": 10 }, { "epoch": 0.46, "learning_rate": 2e-05, "loss": 1.3937, "step": 11 }, { "epoch": 0.5, "learning_rate": 1.9999594849888083e-05, "loss": 0.9765, "step": 12 }, { "epoch": 0.54, "learning_rate": 1.999837943238166e-05, "loss": 1.2983, "step": 13 }, { "epoch": 0.58, "learning_rate": 1.9996353845966033e-05, "loss": 1.0346, "step": 14 }, { "epoch": 0.62, "learning_rate": 1.9993518254774517e-05, "loss": 1.0499, "step": 15 }, { "epoch": 0.67, "learning_rate": 1.998987288857513e-05, "loss": 1.071, "step": 16 }, { "epoch": 0.71, "learning_rate": 1.9985418042751975e-05, "loss": 1.2056, "step": 17 }, { "epoch": 0.75, "learning_rate": 1.998015407828131e-05, "loss": 0.8992, "step": 18 }, { "epoch": 0.79, "learning_rate": 1.9974081421702296e-05, "loss": 1.1998, "step": 19 }, { "epoch": 0.83, "learning_rate": 1.9967200565082426e-05, "loss": 0.8691, "step": 20 }, { "epoch": 0.88, "learning_rate": 1.9959512065977673e-05, "loss": 0.8735, "step": 21 }, { "epoch": 0.92, "learning_rate": 1.9951016547387286e-05, "loss": 0.7922, "step": 22 }, { "epoch": 0.96, "learning_rate": 1.9941714697703333e-05, "loss": 0.8494, "step": 23 }, { "epoch": 1.0, "learning_rate": 1.993160727065489e-05, "loss": 0.8835, "step": 24 }, { "epoch": 1.04, "learning_rate": 1.9920695085247012e-05, "loss": 0.5345, "step": 25 }, { "epoch": 1.08, "learning_rate": 1.9908979025694312e-05, "loss": 0.4796, "step": 26 }, { "epoch": 1.12, "learning_rate": 1.989646004134937e-05, "loss": 0.5371, "step": 27 }, { "epoch": 1.17, "learning_rate": 1.9883139146625763e-05, "loss": 0.4286, "step": 28 }, { "epoch": 1.21, "learning_rate": 1.9869017420915888e-05, "loss": 0.5679, "step": 29 }, { "epoch": 1.25, "learning_rate": 1.9854096008503495e-05, "loss": 0.5667, "step": 30 }, { "epoch": 1.29, "learning_rate": 1.9838376118470965e-05, "loss": 0.4878, "step": 31 }, { "epoch": 1.33, "learning_rate": 1.9821859024601345e-05, "loss": 0.6535, "step": 32 }, { "epoch": 1.38, "learning_rate": 1.9804546065275116e-05, "loss": 0.566, "step": 33 }, { "epoch": 1.42, "learning_rate": 1.978643864336176e-05, "loss": 0.4362, "step": 34 }, { "epoch": 1.46, "learning_rate": 1.9767538226106078e-05, "loss": 0.6098, "step": 35 }, { "epoch": 1.5, "learning_rate": 1.9747846345009306e-05, "loss": 0.3936, "step": 36 }, { "epoch": 1.54, "learning_rate": 1.9727364595705012e-05, "loss": 0.5268, "step": 37 }, { "epoch": 1.58, "learning_rate": 1.9706094637829797e-05, "loss": 0.5022, "step": 38 }, { "epoch": 1.62, "learning_rate": 1.9684038194888827e-05, "loss": 0.486, "step": 39 }, { "epoch": 1.67, "learning_rate": 1.9661197054116165e-05, "loss": 0.5562, "step": 40 }, { "epoch": 1.71, "learning_rate": 1.963757306632996e-05, "loss": 0.3986, "step": 41 }, { "epoch": 1.75, "learning_rate": 1.9613168145782468e-05, "loss": 0.4909, "step": 42 }, { "epoch": 1.79, "learning_rate": 1.958798427000495e-05, "loss": 0.4447, "step": 43 }, { "epoch": 1.83, "learning_rate": 1.956202347964743e-05, "loss": 0.4243, "step": 44 }, { "epoch": 1.88, "learning_rate": 1.9535287878313315e-05, "loss": 0.4518, "step": 45 }, { "epoch": 1.92, "learning_rate": 1.9507779632388997e-05, "loss": 0.5424, "step": 46 }, { "epoch": 1.96, "learning_rate": 1.947950097086825e-05, "loss": 0.472, "step": 47 }, { "epoch": 2.0, "learning_rate": 1.945045418517165e-05, "loss": 0.4739, "step": 48 }, { "epoch": 2.04, "learning_rate": 1.9420641628960897e-05, "loss": 0.3188, "step": 49 }, { "epoch": 2.08, "learning_rate": 1.9390065717948084e-05, "loss": 0.3718, "step": 50 }, { "epoch": 2.12, "learning_rate": 1.9358728929699966e-05, "loss": 0.2013, "step": 51 }, { "epoch": 2.17, "learning_rate": 1.9326633803437197e-05, "loss": 0.2719, "step": 52 }, { "epoch": 2.21, "learning_rate": 1.929378293982857e-05, "loss": 0.2863, "step": 53 }, { "epoch": 2.25, "learning_rate": 1.926017900078031e-05, "loss": 0.2934, "step": 54 }, { "epoch": 2.29, "learning_rate": 1.922582470922034e-05, "loss": 0.2446, "step": 55 }, { "epoch": 2.33, "learning_rate": 1.9190722848877683e-05, "loss": 0.2367, "step": 56 }, { "epoch": 2.38, "learning_rate": 1.9154876264056863e-05, "loss": 0.2255, "step": 57 }, { "epoch": 2.42, "learning_rate": 1.911828785940745e-05, "loss": 0.2965, "step": 58 }, { "epoch": 2.46, "learning_rate": 1.908096059968869e-05, "loss": 0.2464, "step": 59 }, { "epoch": 2.5, "learning_rate": 1.904289750952928e-05, "loss": 0.2446, "step": 60 }, { "epoch": 2.54, "learning_rate": 1.900410167318226e-05, "loss": 0.2948, "step": 61 }, { "epoch": 2.58, "learning_rate": 1.8964576234275123e-05, "loss": 0.2947, "step": 62 }, { "epoch": 2.62, "learning_rate": 1.8924324395555066e-05, "loss": 0.2137, "step": 63 }, { "epoch": 2.67, "learning_rate": 1.8883349418629487e-05, "loss": 0.2963, "step": 64 }, { "epoch": 2.71, "learning_rate": 1.8841654623701673e-05, "loss": 0.319, "step": 65 }, { "epoch": 2.75, "learning_rate": 1.8799243389301796e-05, "loss": 0.2211, "step": 66 }, { "epoch": 2.79, "learning_rate": 1.8756119152013134e-05, "loss": 0.2289, "step": 67 }, { "epoch": 2.83, "learning_rate": 1.8712285406193585e-05, "loss": 0.2706, "step": 68 }, { "epoch": 2.88, "learning_rate": 1.866774570369257e-05, "loss": 0.2301, "step": 69 }, { "epoch": 2.92, "learning_rate": 1.8622503653563173e-05, "loss": 0.2522, "step": 70 }, { "epoch": 2.96, "learning_rate": 1.8576562921769727e-05, "loss": 0.2784, "step": 71 }, { "epoch": 3.0, "learning_rate": 1.8529927230890757e-05, "loss": 0.3307, "step": 72 }, { "epoch": 3.04, "learning_rate": 1.8482600359817344e-05, "loss": 0.1112, "step": 73 }, { "epoch": 3.08, "learning_rate": 1.843458614344691e-05, "loss": 0.1366, "step": 74 }, { "epoch": 3.12, "learning_rate": 1.8385888472372474e-05, "loss": 0.1771, "step": 75 }, { "epoch": 3.17, "learning_rate": 1.833651129256742e-05, "loss": 0.1372, "step": 76 }, { "epoch": 3.21, "learning_rate": 1.828645860506573e-05, "loss": 0.1824, "step": 77 }, { "epoch": 3.25, "learning_rate": 1.8235734465637794e-05, "loss": 0.1933, "step": 78 }, { "epoch": 3.29, "learning_rate": 1.8184342984461766e-05, "loss": 0.2282, "step": 79 }, { "epoch": 3.33, "learning_rate": 1.8132288325790518e-05, "loss": 0.1289, "step": 80 }, { "epoch": 3.38, "learning_rate": 1.8079574707614202e-05, "loss": 0.1726, "step": 81 }, { "epoch": 3.42, "learning_rate": 1.802620640131848e-05, "loss": 0.1802, "step": 82 }, { "epoch": 3.46, "learning_rate": 1.797218773133841e-05, "loss": 0.1267, "step": 83 }, { "epoch": 3.5, "learning_rate": 1.7917523074808024e-05, "loss": 0.1633, "step": 84 }, { "epoch": 3.54, "learning_rate": 1.786221686120567e-05, "loss": 0.1657, "step": 85 }, { "epoch": 3.58, "learning_rate": 1.7806273571995066e-05, "loss": 0.1596, "step": 86 }, { "epoch": 3.62, "learning_rate": 1.7749697740262197e-05, "loss": 0.1457, "step": 87 }, { "epoch": 3.67, "learning_rate": 1.769249395034797e-05, "loss": 0.1333, "step": 88 }, { "epoch": 3.71, "learning_rate": 1.7634666837476765e-05, "loss": 0.2186, "step": 89 }, { "epoch": 3.75, "learning_rate": 1.757622108738083e-05, "loss": 0.1399, "step": 90 }, { "epoch": 3.79, "learning_rate": 1.7517161435920606e-05, "loss": 0.1465, "step": 91 }, { "epoch": 3.83, "learning_rate": 1.7457492668700967e-05, "loss": 0.1402, "step": 92 }, { "epoch": 3.88, "learning_rate": 1.7397219620683465e-05, "loss": 0.2108, "step": 93 }, { "epoch": 3.92, "learning_rate": 1.7336347175794523e-05, "loss": 0.1197, "step": 94 }, { "epoch": 3.96, "learning_rate": 1.7274880266529716e-05, "loss": 0.1889, "step": 95 }, { "epoch": 4.0, "learning_rate": 1.721282387355408e-05, "loss": 0.1857, "step": 96 }, { "epoch": 4.04, "learning_rate": 1.715018302529852e-05, "loss": 0.0949, "step": 97 }, { "epoch": 4.08, "learning_rate": 1.7086962797552376e-05, "loss": 0.102, "step": 98 }, { "epoch": 4.12, "learning_rate": 1.7023168313052118e-05, "loss": 0.07, "step": 99 }, { "epoch": 4.17, "learning_rate": 1.6958804741066254e-05, "loss": 0.0904, "step": 100 }, { "epoch": 4.21, "learning_rate": 1.689387729697646e-05, "loss": 0.1163, "step": 101 }, { "epoch": 4.25, "learning_rate": 1.6828391241854983e-05, "loss": 0.0992, "step": 102 }, { "epoch": 4.29, "learning_rate": 1.6762351882038342e-05, "loss": 0.1115, "step": 103 }, { "epoch": 4.33, "learning_rate": 1.669576456869733e-05, "loss": 0.0796, "step": 104 }, { "epoch": 4.38, "learning_rate": 1.6628634697403447e-05, "loss": 0.0835, "step": 105 }, { "epoch": 4.42, "learning_rate": 1.6560967707691663e-05, "loss": 0.1216, "step": 106 }, { "epoch": 4.46, "learning_rate": 1.649276908261967e-05, "loss": 0.0637, "step": 107 }, { "epoch": 4.5, "learning_rate": 1.642404434832358e-05, "loss": 0.138, "step": 108 }, { "epoch": 4.54, "learning_rate": 1.635479907357016e-05, "loss": 0.1041, "step": 109 }, { "epoch": 4.58, "learning_rate": 1.6285038869305565e-05, "loss": 0.0871, "step": 110 }, { "epoch": 4.62, "learning_rate": 1.621476938820071e-05, "loss": 0.0819, "step": 111 }, { "epoch": 4.67, "learning_rate": 1.6143996324193227e-05, "loss": 0.034, "step": 112 }, { "epoch": 4.71, "learning_rate": 1.6072725412026066e-05, "loss": 0.0366, "step": 113 }, { "epoch": 4.75, "learning_rate": 1.6000962426782844e-05, "loss": 0.0967, "step": 114 }, { "epoch": 4.79, "learning_rate": 1.592871318341986e-05, "loss": 0.1434, "step": 115 }, { "epoch": 4.83, "learning_rate": 1.585598353629492e-05, "loss": 0.1013, "step": 116 }, { "epoch": 4.88, "learning_rate": 1.5782779378692957e-05, "loss": 0.1169, "step": 117 }, { "epoch": 4.92, "learning_rate": 1.57091066423485e-05, "loss": 0.0837, "step": 118 }, { "epoch": 4.96, "learning_rate": 1.5634971296965027e-05, "loss": 0.1023, "step": 119 }, { "epoch": 5.0, "learning_rate": 1.5560379349731234e-05, "loss": 0.1044, "step": 120 }, { "epoch": 5.04, "learning_rate": 1.5485336844834274e-05, "loss": 0.0449, "step": 121 }, { "epoch": 5.08, "learning_rate": 1.5409849862969994e-05, "loss": 0.0338, "step": 122 }, { "epoch": 5.12, "learning_rate": 1.5333924520850227e-05, "loss": 0.0379, "step": 123 }, { "epoch": 5.17, "learning_rate": 1.5257566970707147e-05, "loss": 0.0553, "step": 124 }, { "epoch": 5.21, "learning_rate": 1.5180783399794749e-05, "loss": 0.0408, "step": 125 }, { "epoch": 5.25, "learning_rate": 1.5103580029887504e-05, "loss": 0.0688, "step": 126 }, { "epoch": 5.29, "learning_rate": 1.5025963116776203e-05, "loss": 0.0781, "step": 127 }, { "epoch": 5.33, "learning_rate": 1.4947938949761054e-05, "loss": 0.0799, "step": 128 }, { "epoch": 5.38, "learning_rate": 1.4869513851142051e-05, "loss": 0.0328, "step": 129 }, { "epoch": 5.42, "learning_rate": 1.4790694175706698e-05, "loss": 0.0869, "step": 130 }, { "epoch": 5.46, "learning_rate": 1.4711486310215053e-05, "loss": 0.0185, "step": 131 }, { "epoch": 5.5, "learning_rate": 1.4631896672882235e-05, "loss": 0.0751, "step": 132 }, { "epoch": 5.54, "learning_rate": 1.4551931712858334e-05, "loss": 0.0842, "step": 133 }, { "epoch": 5.58, "learning_rate": 1.4471597909705858e-05, "loss": 0.0718, "step": 134 }, { "epoch": 5.62, "learning_rate": 1.4390901772874668e-05, "loss": 0.0854, "step": 135 }, { "epoch": 5.67, "learning_rate": 1.4309849841174538e-05, "loss": 0.0508, "step": 136 }, { "epoch": 5.71, "learning_rate": 1.422844868224531e-05, "loss": 0.0722, "step": 137 }, { "epoch": 5.75, "learning_rate": 1.4146704892024714e-05, "loss": 0.0577, "step": 138 }, { "epoch": 5.79, "learning_rate": 1.40646250942139e-05, "loss": 0.0654, "step": 139 }, { "epoch": 5.83, "learning_rate": 1.3982215939740726e-05, "loss": 0.0452, "step": 140 }, { "epoch": 5.88, "learning_rate": 1.3899484106220816e-05, "loss": 0.0549, "step": 141 }, { "epoch": 5.92, "learning_rate": 1.3816436297416496e-05, "loss": 0.0385, "step": 142 }, { "epoch": 5.96, "learning_rate": 1.3733079242693572e-05, "loss": 0.0729, "step": 143 }, { "epoch": 6.0, "learning_rate": 1.3649419696476057e-05, "loss": 0.041, "step": 144 }, { "epoch": 6.04, "learning_rate": 1.356546443769885e-05, "loss": 0.02, "step": 145 }, { "epoch": 6.08, "learning_rate": 1.3481220269258449e-05, "loss": 0.0611, "step": 146 }, { "epoch": 6.12, "learning_rate": 1.3396694017461708e-05, "loss": 0.0374, "step": 147 }, { "epoch": 6.17, "learning_rate": 1.3311892531472705e-05, "loss": 0.0294, "step": 148 }, { "epoch": 6.21, "learning_rate": 1.3226822682757745e-05, "loss": 0.0483, "step": 149 }, { "epoch": 6.25, "learning_rate": 1.3141491364528576e-05, "loss": 0.0242, "step": 150 }, { "epoch": 6.29, "learning_rate": 1.3055905491183822e-05, "loss": 0.0307, "step": 151 }, { "epoch": 6.33, "learning_rate": 1.2970071997748712e-05, "loss": 0.0529, "step": 152 }, { "epoch": 6.38, "learning_rate": 1.288399783931315e-05, "loss": 0.0542, "step": 153 }, { "epoch": 6.42, "learning_rate": 1.2797689990468113e-05, "loss": 0.0297, "step": 154 }, { "epoch": 6.46, "learning_rate": 1.2711155444740529e-05, "loss": 0.0292, "step": 155 }, { "epoch": 6.5, "learning_rate": 1.2624401214026574e-05, "loss": 0.0176, "step": 156 }, { "epoch": 6.54, "learning_rate": 1.2537434328023501e-05, "loss": 0.0154, "step": 157 }, { "epoch": 6.58, "learning_rate": 1.2450261833660033e-05, "loss": 0.02, "step": 158 }, { "epoch": 6.62, "learning_rate": 1.2362890794525342e-05, "loss": 0.0589, "step": 159 }, { "epoch": 6.67, "learning_rate": 1.2275328290296677e-05, "loss": 0.0283, "step": 160 }, { "epoch": 6.71, "learning_rate": 1.2187581416165721e-05, "loss": 0.0307, "step": 161 }, { "epoch": 6.75, "learning_rate": 1.2099657282263651e-05, "loss": 0.0356, "step": 162 }, { "epoch": 6.79, "learning_rate": 1.2011563013084996e-05, "loss": 0.0466, "step": 163 }, { "epoch": 6.83, "learning_rate": 1.1923305746910372e-05, "loss": 0.0354, "step": 164 }, { "epoch": 6.88, "learning_rate": 1.1834892635228024e-05, "loss": 0.0432, "step": 165 }, { "epoch": 6.92, "learning_rate": 1.1746330842154371e-05, "loss": 0.0324, "step": 166 }, { "epoch": 6.96, "learning_rate": 1.1657627543853491e-05, "loss": 0.0406, "step": 167 }, { "epoch": 7.0, "learning_rate": 1.156878992795563e-05, "loss": 0.0394, "step": 168 }, { "epoch": 7.04, "learning_rate": 1.1479825192974791e-05, "loss": 0.0234, "step": 169 }, { "epoch": 7.08, "learning_rate": 1.1390740547725443e-05, "loss": 0.0135, "step": 170 }, { "epoch": 7.12, "learning_rate": 1.1301543210738383e-05, "loss": 0.0438, "step": 171 }, { "epoch": 7.17, "learning_rate": 1.1212240409675825e-05, "loss": 0.015, "step": 172 }, { "epoch": 7.21, "learning_rate": 1.1122839380745738e-05, "loss": 0.0083, "step": 173 }, { "epoch": 7.25, "learning_rate": 1.1033347368115494e-05, "loss": 0.0212, "step": 174 }, { "epoch": 7.29, "learning_rate": 1.0943771623324884e-05, "loss": 0.0245, "step": 175 }, { "epoch": 7.33, "learning_rate": 1.085411940469851e-05, "loss": 0.0083, "step": 176 }, { "epoch": 7.38, "learning_rate": 1.0764397976757658e-05, "loss": 0.0187, "step": 177 }, { "epoch": 7.42, "learning_rate": 1.0674614609631634e-05, "loss": 0.0212, "step": 178 }, { "epoch": 7.46, "learning_rate": 1.0584776578468698e-05, "loss": 0.022, "step": 179 }, { "epoch": 7.5, "learning_rate": 1.0494891162846515e-05, "loss": 0.0374, "step": 180 }, { "epoch": 7.54, "learning_rate": 1.040496564618233e-05, "loss": 0.014, "step": 181 }, { "epoch": 7.58, "learning_rate": 1.0315007315142772e-05, "loss": 0.0146, "step": 182 }, { "epoch": 7.62, "learning_rate": 1.0225023459053416e-05, "loss": 0.0226, "step": 183 }, { "epoch": 7.67, "learning_rate": 1.0135021369308138e-05, "loss": 0.0176, "step": 184 }, { "epoch": 7.71, "learning_rate": 1.004500833877828e-05, "loss": 0.0101, "step": 185 }, { "epoch": 7.75, "learning_rate": 9.954991661221724e-06, "loss": 0.0093, "step": 186 }, { "epoch": 7.79, "learning_rate": 9.864978630691865e-06, "loss": 0.0228, "step": 187 }, { "epoch": 7.83, "learning_rate": 9.774976540946589e-06, "loss": 0.0194, "step": 188 }, { "epoch": 7.88, "learning_rate": 9.684992684857232e-06, "loss": 0.0445, "step": 189 }, { "epoch": 7.92, "learning_rate": 9.595034353817673e-06, "loss": 0.0078, "step": 190 }, { "epoch": 7.96, "learning_rate": 9.505108837153489e-06, "loss": 0.0227, "step": 191 }, { "epoch": 8.0, "learning_rate": 9.415223421531308e-06, "loss": 0.0188, "step": 192 }, { "epoch": 8.04, "learning_rate": 9.325385390368367e-06, "loss": 0.0114, "step": 193 }, { "epoch": 8.08, "learning_rate": 9.23560202324235e-06, "loss": 0.0084, "step": 194 }, { "epoch": 8.12, "learning_rate": 9.145880595301495e-06, "loss": 0.0092, "step": 195 }, { "epoch": 8.17, "learning_rate": 9.056228376675118e-06, "loss": 0.0158, "step": 196 }, { "epoch": 8.21, "learning_rate": 8.966652631884506e-06, "loss": 0.0045, "step": 197 }, { "epoch": 8.25, "learning_rate": 8.877160619254264e-06, "loss": 0.0148, "step": 198 }, { "epoch": 8.29, "learning_rate": 8.787759590324177e-06, "loss": 0.0076, "step": 199 }, { "epoch": 8.33, "learning_rate": 8.698456789261617e-06, "loss": 0.0086, "step": 200 }, { "epoch": 8.38, "learning_rate": 8.609259452274559e-06, "loss": 0.0121, "step": 201 }, { "epoch": 8.42, "learning_rate": 8.52017480702521e-06, "loss": 0.01, "step": 202 }, { "epoch": 8.46, "learning_rate": 8.431210072044371e-06, "loss": 0.0094, "step": 203 }, { "epoch": 8.5, "learning_rate": 8.342372456146512e-06, "loss": 0.0036, "step": 204 }, { "epoch": 8.54, "learning_rate": 8.253669157845632e-06, "loss": 0.0123, "step": 205 }, { "epoch": 8.58, "learning_rate": 8.165107364771979e-06, "loss": 0.0262, "step": 206 }, { "epoch": 8.62, "learning_rate": 8.076694253089632e-06, "loss": 0.0409, "step": 207 }, { "epoch": 8.67, "learning_rate": 7.988436986915005e-06, "loss": 0.0219, "step": 208 }, { "epoch": 8.71, "learning_rate": 7.900342717736354e-06, "loss": 0.0114, "step": 209 }, { "epoch": 8.75, "learning_rate": 7.812418583834282e-06, "loss": 0.0184, "step": 210 }, { "epoch": 8.79, "learning_rate": 7.724671709703328e-06, "loss": 0.0151, "step": 211 }, { "epoch": 8.83, "learning_rate": 7.637109205474665e-06, "loss": 0.0371, "step": 212 }, { "epoch": 8.88, "learning_rate": 7.5497381663399716e-06, "loss": 0.0111, "step": 213 }, { "epoch": 8.92, "learning_rate": 7.462565671976504e-06, "loss": 0.0236, "step": 214 }, { "epoch": 8.96, "learning_rate": 7.375598785973429e-06, "loss": 0.0131, "step": 215 }, { "epoch": 9.0, "learning_rate": 7.288844555259471e-06, "loss": 0.0104, "step": 216 }, { "epoch": 9.04, "learning_rate": 7.202310009531886e-06, "loss": 0.0021, "step": 217 }, { "epoch": 9.08, "learning_rate": 7.116002160686851e-06, "loss": 0.0202, "step": 218 }, { "epoch": 9.12, "learning_rate": 7.0299280022512875e-06, "loss": 0.0089, "step": 219 }, { "epoch": 9.17, "learning_rate": 6.944094508816182e-06, "loss": 0.0027, "step": 220 }, { "epoch": 9.21, "learning_rate": 6.858508635471428e-06, "loss": 0.0009, "step": 221 }, { "epoch": 9.25, "learning_rate": 6.773177317242257e-06, "loss": 0.0061, "step": 222 }, { "epoch": 9.29, "learning_rate": 6.688107468527297e-06, "loss": 0.0023, "step": 223 }, { "epoch": 9.33, "learning_rate": 6.603305982538295e-06, "loss": 0.0144, "step": 224 }, { "epoch": 9.38, "learning_rate": 6.518779730741555e-06, "loss": 0.0145, "step": 225 }, { "epoch": 9.42, "learning_rate": 6.434535562301153e-06, "loss": 0.01, "step": 226 }, { "epoch": 9.46, "learning_rate": 6.350580303523947e-06, "loss": 0.007, "step": 227 }, { "epoch": 9.5, "learning_rate": 6.266920757306429e-06, "loss": 0.0065, "step": 228 }, { "epoch": 9.54, "learning_rate": 6.183563702583506e-06, "loss": 0.0218, "step": 229 }, { "epoch": 9.58, "learning_rate": 6.100515893779188e-06, "loss": 0.0089, "step": 230 }, { "epoch": 9.62, "learning_rate": 6.01778406025928e-06, "loss": 0.0153, "step": 231 }, { "epoch": 9.67, "learning_rate": 5.935374905786102e-06, "loss": 0.0106, "step": 232 }, { "epoch": 9.71, "learning_rate": 5.8532951079752895e-06, "loss": 0.0111, "step": 233 }, { "epoch": 9.75, "learning_rate": 5.771551317754691e-06, "loss": 0.0289, "step": 234 }, { "epoch": 9.79, "learning_rate": 5.690150158825462e-06, "loss": 0.0136, "step": 235 }, { "epoch": 9.83, "learning_rate": 5.609098227125334e-06, "loss": 0.0087, "step": 236 }, { "epoch": 9.88, "learning_rate": 5.528402090294142e-06, "loss": 0.0147, "step": 237 }, { "epoch": 9.92, "learning_rate": 5.448068287141663e-06, "loss": 0.0122, "step": 238 }, { "epoch": 9.96, "learning_rate": 5.368103327117768e-06, "loss": 0.0181, "step": 239 }, { "epoch": 10.0, "learning_rate": 5.288513689784951e-06, "loss": 0.0142, "step": 240 }, { "epoch": 10.04, "learning_rate": 5.209305824293307e-06, "loss": 0.0038, "step": 241 }, { "epoch": 10.08, "learning_rate": 5.130486148857952e-06, "loss": 0.0144, "step": 242 }, { "epoch": 10.12, "learning_rate": 5.05206105023895e-06, "loss": 0.0034, "step": 243 }, { "epoch": 10.17, "learning_rate": 4.974036883223798e-06, "loss": 0.0013, "step": 244 }, { "epoch": 10.21, "learning_rate": 4.896419970112499e-06, "loss": 0.0083, "step": 245 }, { "epoch": 10.25, "learning_rate": 4.819216600205254e-06, "loss": 0.0194, "step": 246 }, { "epoch": 10.29, "learning_rate": 4.742433029292856e-06, "loss": 0.0019, "step": 247 }, { "epoch": 10.33, "learning_rate": 4.6660754791497755e-06, "loss": 0.0074, "step": 248 }, { "epoch": 10.38, "learning_rate": 4.590150137030009e-06, "loss": 0.0013, "step": 249 }, { "epoch": 10.42, "learning_rate": 4.514663155165731e-06, "loss": 0.0098, "step": 250 }, { "epoch": 10.46, "learning_rate": 4.439620650268771e-06, "loss": 0.0016, "step": 251 }, { "epoch": 10.5, "learning_rate": 4.365028703034976e-06, "loss": 0.0037, "step": 252 }, { "epoch": 10.54, "learning_rate": 4.290893357651502e-06, "loss": 0.0056, "step": 253 }, { "epoch": 10.58, "learning_rate": 4.217220621307043e-06, "loss": 0.0148, "step": 254 }, { "epoch": 10.62, "learning_rate": 4.144016463705081e-06, "loss": 0.0028, "step": 255 }, { "epoch": 10.67, "learning_rate": 4.071286816580142e-06, "loss": 0.0033, "step": 256 }, { "epoch": 10.71, "learning_rate": 3.999037573217157e-06, "loss": 0.0117, "step": 257 }, { "epoch": 10.75, "learning_rate": 3.927274587973935e-06, "loss": 0.0108, "step": 258 }, { "epoch": 10.79, "learning_rate": 3.856003675806777e-06, "loss": 0.0096, "step": 259 }, { "epoch": 10.83, "learning_rate": 3.78523061179929e-06, "loss": 0.0225, "step": 260 }, { "epoch": 10.88, "learning_rate": 3.7149611306944356e-06, "loss": 0.0192, "step": 261 }, { "epoch": 10.92, "learning_rate": 3.645200926429844e-06, "loss": 0.0096, "step": 262 }, { "epoch": 10.96, "learning_rate": 3.5759556516764205e-06, "loss": 0.0045, "step": 263 }, { "epoch": 11.0, "learning_rate": 3.507230917380332e-06, "loss": 0.0265, "step": 264 }, { "epoch": 11.04, "learning_rate": 3.4390322923083385e-06, "loss": 0.0033, "step": 265 }, { "epoch": 11.08, "learning_rate": 3.3713653025965544e-06, "loss": 0.0111, "step": 266 }, { "epoch": 11.12, "learning_rate": 3.3042354313026702e-06, "loss": 0.0106, "step": 267 }, { "epoch": 11.17, "learning_rate": 3.237648117961665e-06, "loss": 0.0172, "step": 268 }, { "epoch": 11.21, "learning_rate": 3.1716087581450193e-06, "loss": 0.0027, "step": 269 }, { "epoch": 11.25, "learning_rate": 3.1061227030235442e-06, "loss": 0.0074, "step": 270 }, { "epoch": 11.29, "learning_rate": 3.041195258933749e-06, "loss": 0.0037, "step": 271 }, { "epoch": 11.33, "learning_rate": 2.976831686947884e-06, "loss": 0.0006, "step": 272 }, { "epoch": 11.38, "learning_rate": 2.913037202447625e-06, "loss": 0.0053, "step": 273 }, { "epoch": 11.42, "learning_rate": 2.8498169747014824e-06, "loss": 0.0049, "step": 274 }, { "epoch": 11.46, "learning_rate": 2.787176126445923e-06, "loss": 0.0098, "step": 275 }, { "epoch": 11.5, "learning_rate": 2.725119733470284e-06, "loss": 0.0064, "step": 276 }, { "epoch": 11.54, "learning_rate": 2.663652824205476e-06, "loss": 0.0051, "step": 277 }, { "epoch": 11.58, "learning_rate": 2.6027803793165353e-06, "loss": 0.007, "step": 278 }, { "epoch": 11.62, "learning_rate": 2.5425073312990334e-06, "loss": 0.0121, "step": 279 }, { "epoch": 11.67, "learning_rate": 2.4828385640793974e-06, "loss": 0.0055, "step": 280 }, { "epoch": 11.71, "learning_rate": 2.4237789126191715e-06, "loss": 0.0035, "step": 281 }, { "epoch": 11.75, "learning_rate": 2.3653331625232367e-06, "loss": 0.004, "step": 282 }, { "epoch": 11.79, "learning_rate": 2.307506049652031e-06, "loss": 0.0114, "step": 283 }, { "epoch": 11.83, "learning_rate": 2.250302259737803e-06, "loss": 0.0004, "step": 284 }, { "epoch": 11.88, "learning_rate": 2.1937264280049365e-06, "loss": 0.0247, "step": 285 }, { "epoch": 11.92, "learning_rate": 2.137783138794335e-06, "loss": 0.0128, "step": 286 }, { "epoch": 11.96, "learning_rate": 2.0824769251919775e-06, "loss": 0.0015, "step": 287 }, { "epoch": 12.0, "learning_rate": 2.027812268661592e-06, "loss": 0.0182, "step": 288 }, { "epoch": 12.04, "learning_rate": 1.9737935986815205e-06, "loss": 0.004, "step": 289 }, { "epoch": 12.08, "learning_rate": 1.9204252923858003e-06, "loss": 0.0028, "step": 290 }, { "epoch": 12.12, "learning_rate": 1.8677116742094858e-06, "loss": 0.0056, "step": 291 }, { "epoch": 12.17, "learning_rate": 1.8156570155382357e-06, "loss": 0.0048, "step": 292 }, { "epoch": 12.21, "learning_rate": 1.764265534362205e-06, "loss": 0.0125, "step": 293 }, { "epoch": 12.25, "learning_rate": 1.7135413949342706e-06, "loss": 0.0017, "step": 294 }, { "epoch": 12.29, "learning_rate": 1.6634887074325844e-06, "loss": 0.0135, "step": 295 }, { "epoch": 12.33, "learning_rate": 1.6141115276275298e-06, "loss": 0.0093, "step": 296 }, { "epoch": 12.38, "learning_rate": 1.565413856553095e-06, "loss": 0.0095, "step": 297 }, { "epoch": 12.42, "learning_rate": 1.5173996401826563e-06, "loss": 0.0082, "step": 298 }, { "epoch": 12.46, "learning_rate": 1.470072769109242e-06, "loss": 0.0089, "step": 299 }, { "epoch": 12.5, "learning_rate": 1.4234370782302742e-06, "loss": 0.0023, "step": 300 }, { "epoch": 12.54, "learning_rate": 1.3774963464368295e-06, "loss": 0.0118, "step": 301 }, { "epoch": 12.58, "learning_rate": 1.3322542963074314e-06, "loss": 0.0053, "step": 302 }, { "epoch": 12.62, "learning_rate": 1.287714593806415e-06, "loss": 0.0128, "step": 303 }, { "epoch": 12.67, "learning_rate": 1.2438808479868715e-06, "loss": 0.0042, "step": 304 }, { "epoch": 12.71, "learning_rate": 1.200756610698205e-06, "loss": 0.0132, "step": 305 }, { "epoch": 12.75, "learning_rate": 1.1583453762983289e-06, "loss": 0.0093, "step": 306 }, { "epoch": 12.79, "learning_rate": 1.1166505813705187e-06, "loss": 0.0095, "step": 307 }, { "epoch": 12.83, "learning_rate": 1.0756756044449358e-06, "loss": 0.0043, "step": 308 }, { "epoch": 12.88, "learning_rate": 1.035423765724879e-06, "loss": 0.0129, "step": 309 }, { "epoch": 12.92, "learning_rate": 9.958983268177425e-07, "loss": 0.0118, "step": 310 }, { "epoch": 12.96, "learning_rate": 9.571024904707238e-07, "loss": 0.0036, "step": 311 }, { "epoch": 13.0, "learning_rate": 9.190394003113123e-07, "loss": 0.0034, "step": 312 }, { "epoch": 13.04, "learning_rate": 8.817121405925543e-07, "loss": 0.0017, "step": 313 }, { "epoch": 13.08, "learning_rate": 8.451237359431397e-07, "loss": 0.0046, "step": 314 }, { "epoch": 13.12, "learning_rate": 8.092771511223185e-07, "loss": 0.0003, "step": 315 }, { "epoch": 13.17, "learning_rate": 7.741752907796584e-07, "loss": 0.0048, "step": 316 }, { "epoch": 13.21, "learning_rate": 7.398209992196914e-07, "loss": 0.0085, "step": 317 }, { "epoch": 13.25, "learning_rate": 7.062170601714302e-07, "loss": 0.0036, "step": 318 }, { "epoch": 13.29, "learning_rate": 6.73366196562808e-07, "loss": 0.0027, "step": 319 }, { "epoch": 13.33, "learning_rate": 6.412710703000368e-07, "loss": 0.01, "step": 320 }, { "epoch": 13.38, "learning_rate": 6.099342820519183e-07, "loss": 0.0037, "step": 321 }, { "epoch": 13.42, "learning_rate": 5.79358371039106e-07, "loss": 0.0068, "step": 322 }, { "epoch": 13.46, "learning_rate": 5.495458148283505e-07, "loss": 0.0019, "step": 323 }, { "epoch": 13.5, "learning_rate": 5.204990291317535e-07, "loss": 0.0031, "step": 324 }, { "epoch": 13.54, "learning_rate": 4.92220367611006e-07, "loss": 0.0131, "step": 325 }, { "epoch": 13.58, "learning_rate": 4.647121216866857e-07, "loss": 0.0162, "step": 326 }, { "epoch": 13.62, "learning_rate": 4.3797652035257544e-07, "loss": 0.0021, "step": 327 }, { "epoch": 13.67, "learning_rate": 4.1201572999505e-07, "loss": 0.0101, "step": 328 }, { "epoch": 13.71, "learning_rate": 3.8683185421753313e-07, "loss": 0.0137, "step": 329 }, { "epoch": 13.75, "learning_rate": 3.6242693367004365e-07, "loss": 0.0034, "step": 330 }, { "epoch": 13.79, "learning_rate": 3.38802945883836e-07, "loss": 0.0039, "step": 331 }, { "epoch": 13.83, "learning_rate": 3.1596180511117235e-07, "loss": 0.0086, "step": 332 }, { "epoch": 13.88, "learning_rate": 2.939053621702015e-07, "loss": 0.0113, "step": 333 }, { "epoch": 13.92, "learning_rate": 2.7263540429498747e-07, "loss": 0.0192, "step": 334 }, { "epoch": 13.96, "learning_rate": 2.5215365499069446e-07, "loss": 0.006, "step": 335 }, { "epoch": 14.0, "learning_rate": 2.3246177389392388e-07, "loss": 0.0024, "step": 336 }, { "epoch": 14.04, "learning_rate": 2.1356135663824328e-07, "loss": 0.0003, "step": 337 }, { "epoch": 14.08, "learning_rate": 1.9545393472488738e-07, "loss": 0.0011, "step": 338 }, { "epoch": 14.12, "learning_rate": 1.7814097539865626e-07, "loss": 0.0018, "step": 339 }, { "epoch": 14.17, "learning_rate": 1.6162388152903498e-07, "loss": 0.0058, "step": 340 }, { "epoch": 14.21, "learning_rate": 1.4590399149650769e-07, "loss": 0.0052, "step": 341 }, { "epoch": 14.25, "learning_rate": 1.309825790841146e-07, "loss": 0.006, "step": 342 }, { "epoch": 14.29, "learning_rate": 1.1686085337423991e-07, "loss": 0.0049, "step": 343 }, { "epoch": 14.33, "learning_rate": 1.0353995865063138e-07, "loss": 0.004, "step": 344 }, { "epoch": 14.38, "learning_rate": 9.10209743056889e-08, "loss": 0.009, "step": 345 }, { "epoch": 14.42, "learning_rate": 7.930491475299229e-08, "loss": 0.0017, "step": 346 }, { "epoch": 14.46, "learning_rate": 6.839272934511143e-08, "loss": 0.0021, "step": 347 }, { "epoch": 14.5, "learning_rate": 5.828530229667228e-08, "loss": 0.0064, "step": 348 }, { "epoch": 14.54, "learning_rate": 4.898345261271531e-08, "loss": 0.0092, "step": 349 }, { "epoch": 14.58, "learning_rate": 4.0487934022328533e-08, "loss": 0.0002, "step": 350 }, { "epoch": 14.62, "learning_rate": 3.27994349175742e-08, "loss": 0.0116, "step": 351 }, { "epoch": 14.67, "learning_rate": 2.591857829770672e-08, "loss": 0.0022, "step": 352 }, { "epoch": 14.71, "learning_rate": 1.984592171869082e-08, "loss": 0.0142, "step": 353 }, { "epoch": 14.75, "learning_rate": 1.4581957248026579e-08, "loss": 0.0068, "step": 354 }, { "epoch": 14.79, "learning_rate": 1.0127111424872437e-08, "loss": 0.0046, "step": 355 }, { "epoch": 14.83, "learning_rate": 6.481745225485059e-09, "loss": 0.0054, "step": 356 }, { "epoch": 14.88, "learning_rate": 3.6461540339682855e-09, "loss": 0.0138, "step": 357 }, { "epoch": 14.92, "learning_rate": 1.6205676183411733e-09, "loss": 0.0125, "step": 358 }, { "epoch": 14.96, "learning_rate": 4.0515011191621933e-10, "loss": 0.0022, "step": 359 }, { "epoch": 15.0, "learning_rate": 0.0, "loss": 0.0143, "step": 360 }, { "epoch": 15.0, "step": 360, "total_flos": 1.427028711613399e+17, "train_loss": 0.004024597113311756, "train_runtime": 2809.5847, "train_samples_per_second": 16.358, "train_steps_per_second": 0.128 } ], "logging_steps": 1.0, "max_steps": 360, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 100, "total_flos": 1.427028711613399e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }