{ "best_metric": 0.98815, "best_model_checkpoint": "/media/hongss/ssd/hatespeech_modelbackup/new_weights/1216/KoElectra_123/stage1/20241217T17-29-02/checkpoint-275000", "epoch": 27.5, "eval_steps": 5000, "global_step": 275000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.8010932207107544, "learning_rate": 5.000000000000001e-07, "loss": 0.6941, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.8222021460533142, "learning_rate": 1.0000000000000002e-06, "loss": 0.6895, "step": 200 }, { "epoch": 0.03, "grad_norm": 1.551395297050476, "learning_rate": 1.5e-06, "loss": 0.6726, "step": 300 }, { "epoch": 0.04, "grad_norm": 1.8527400493621826, "learning_rate": 2.0000000000000003e-06, "loss": 0.5988, "step": 400 }, { "epoch": 0.05, "grad_norm": 6.536347389221191, "learning_rate": 2.5e-06, "loss": 0.4735, "step": 500 }, { "epoch": 0.06, "grad_norm": 1.1166355609893799, "learning_rate": 3e-06, "loss": 0.3591, "step": 600 }, { "epoch": 0.07, "grad_norm": 1.7112144231796265, "learning_rate": 3.5e-06, "loss": 0.2803, "step": 700 }, { "epoch": 0.08, "grad_norm": 10.188007354736328, "learning_rate": 4.000000000000001e-06, "loss": 0.2336, "step": 800 }, { "epoch": 0.09, "grad_norm": 2.2752089500427246, "learning_rate": 4.5e-06, "loss": 0.1956, "step": 900 }, { "epoch": 0.1, "grad_norm": 14.095466613769531, "learning_rate": 5e-06, "loss": 0.1955, "step": 1000 }, { "epoch": 0.11, "grad_norm": 2.8912861347198486, "learning_rate": 4.998327759197325e-06, "loss": 0.1589, "step": 1100 }, { "epoch": 0.12, "grad_norm": 23.762054443359375, "learning_rate": 4.996655518394649e-06, "loss": 0.1708, "step": 1200 }, { "epoch": 0.13, "grad_norm": 0.42863136529922485, "learning_rate": 4.994983277591973e-06, "loss": 0.1266, "step": 1300 }, { "epoch": 0.14, "grad_norm": 27.861000061035156, "learning_rate": 4.993311036789298e-06, "loss": 0.1403, "step": 1400 }, { "epoch": 0.15, "grad_norm": 12.980772018432617, "learning_rate": 4.991638795986622e-06, "loss": 0.1378, "step": 1500 }, { "epoch": 0.16, "grad_norm": 0.17961889505386353, "learning_rate": 4.989966555183947e-06, "loss": 0.1168, "step": 1600 }, { "epoch": 0.17, "grad_norm": 1.3810911178588867, "learning_rate": 4.988294314381271e-06, "loss": 0.1129, "step": 1700 }, { "epoch": 0.18, "grad_norm": 22.552303314208984, "learning_rate": 4.986622073578596e-06, "loss": 0.1584, "step": 1800 }, { "epoch": 0.19, "grad_norm": 0.4360831081867218, "learning_rate": 4.98494983277592e-06, "loss": 0.1267, "step": 1900 }, { "epoch": 0.2, "grad_norm": 1.449009656906128, "learning_rate": 4.983277591973244e-06, "loss": 0.1199, "step": 2000 }, { "epoch": 0.21, "grad_norm": 1.0524966716766357, "learning_rate": 4.981605351170569e-06, "loss": 0.1293, "step": 2100 }, { "epoch": 0.22, "grad_norm": 25.5817813873291, "learning_rate": 4.979933110367893e-06, "loss": 0.0851, "step": 2200 }, { "epoch": 0.23, "grad_norm": 0.16489553451538086, "learning_rate": 4.978260869565218e-06, "loss": 0.1246, "step": 2300 }, { "epoch": 0.24, "grad_norm": 0.1531640738248825, "learning_rate": 4.976588628762542e-06, "loss": 0.1167, "step": 2400 }, { "epoch": 0.25, "grad_norm": 23.05634117126465, "learning_rate": 4.974916387959867e-06, "loss": 0.1099, "step": 2500 }, { "epoch": 0.26, "grad_norm": 0.16108916699886322, "learning_rate": 4.973244147157191e-06, "loss": 0.1278, "step": 2600 }, { "epoch": 0.27, "grad_norm": 0.17259421944618225, "learning_rate": 4.971571906354515e-06, "loss": 0.0692, "step": 2700 }, { "epoch": 0.28, "grad_norm": 9.081533432006836, "learning_rate": 4.96989966555184e-06, "loss": 0.1063, "step": 2800 }, { "epoch": 0.29, "grad_norm": 2.4863479137420654, "learning_rate": 4.968227424749164e-06, "loss": 0.1163, "step": 2900 }, { "epoch": 0.3, "grad_norm": 0.11070814728736877, "learning_rate": 4.966555183946489e-06, "loss": 0.0959, "step": 3000 }, { "epoch": 0.31, "grad_norm": 1.8638683557510376, "learning_rate": 4.964882943143814e-06, "loss": 0.084, "step": 3100 }, { "epoch": 0.32, "grad_norm": 0.3781122863292694, "learning_rate": 4.963210702341138e-06, "loss": 0.086, "step": 3200 }, { "epoch": 0.33, "grad_norm": 0.08686799556016922, "learning_rate": 4.961538461538462e-06, "loss": 0.0905, "step": 3300 }, { "epoch": 0.34, "grad_norm": 0.08633287996053696, "learning_rate": 4.959866220735786e-06, "loss": 0.0843, "step": 3400 }, { "epoch": 0.35, "grad_norm": 0.28560230135917664, "learning_rate": 4.958193979933111e-06, "loss": 0.0929, "step": 3500 }, { "epoch": 0.36, "grad_norm": 0.07190747559070587, "learning_rate": 4.9565217391304355e-06, "loss": 0.0889, "step": 3600 }, { "epoch": 0.37, "grad_norm": 25.537450790405273, "learning_rate": 4.9548494983277596e-06, "loss": 0.0946, "step": 3700 }, { "epoch": 0.38, "grad_norm": 8.443319320678711, "learning_rate": 4.9531772575250845e-06, "loss": 0.1029, "step": 3800 }, { "epoch": 0.39, "grad_norm": 0.08515575528144836, "learning_rate": 4.9515050167224085e-06, "loss": 0.0964, "step": 3900 }, { "epoch": 0.4, "grad_norm": 0.10333725810050964, "learning_rate": 4.9498327759197325e-06, "loss": 0.0977, "step": 4000 }, { "epoch": 0.41, "grad_norm": 0.041944719851017, "learning_rate": 4.948160535117057e-06, "loss": 0.0922, "step": 4100 }, { "epoch": 0.42, "grad_norm": 3.2756776809692383, "learning_rate": 4.9464882943143815e-06, "loss": 0.0675, "step": 4200 }, { "epoch": 0.43, "grad_norm": 6.24823522567749, "learning_rate": 4.944816053511706e-06, "loss": 0.0869, "step": 4300 }, { "epoch": 0.44, "grad_norm": 0.09910300374031067, "learning_rate": 4.94314381270903e-06, "loss": 0.1091, "step": 4400 }, { "epoch": 0.45, "grad_norm": 0.7329888939857483, "learning_rate": 4.941471571906355e-06, "loss": 0.1076, "step": 4500 }, { "epoch": 0.46, "grad_norm": 0.031429223716259, "learning_rate": 4.939799331103679e-06, "loss": 0.1021, "step": 4600 }, { "epoch": 0.47, "grad_norm": 0.47189056873321533, "learning_rate": 4.938127090301003e-06, "loss": 0.0815, "step": 4700 }, { "epoch": 0.48, "grad_norm": 7.491260051727295, "learning_rate": 4.936454849498328e-06, "loss": 0.1312, "step": 4800 }, { "epoch": 0.49, "grad_norm": 42.855648040771484, "learning_rate": 4.934782608695652e-06, "loss": 0.0784, "step": 4900 }, { "epoch": 0.5, "grad_norm": 13.520151138305664, "learning_rate": 4.933110367892977e-06, "loss": 0.0815, "step": 5000 }, { "epoch": 0.5, "eval_accuracy": 0.979275, "eval_f1": 0.979275, "eval_loss": 0.07820390909910202, "eval_runtime": 134.6565, "eval_samples_per_second": 297.052, "eval_steps_per_second": 297.052, "step": 5000 }, { "epoch": 0.51, "grad_norm": 4.115912437438965, "learning_rate": 4.931438127090302e-06, "loss": 0.1083, "step": 5100 }, { "epoch": 0.52, "grad_norm": 18.950531005859375, "learning_rate": 4.929765886287626e-06, "loss": 0.0754, "step": 5200 }, { "epoch": 0.53, "grad_norm": 0.024242494255304337, "learning_rate": 4.92809364548495e-06, "loss": 0.0787, "step": 5300 }, { "epoch": 0.54, "grad_norm": 8.753853797912598, "learning_rate": 4.926421404682274e-06, "loss": 0.0824, "step": 5400 }, { "epoch": 0.55, "grad_norm": 8.40188217163086, "learning_rate": 4.924749163879599e-06, "loss": 0.0802, "step": 5500 }, { "epoch": 0.56, "grad_norm": 17.343854904174805, "learning_rate": 4.923076923076924e-06, "loss": 0.0784, "step": 5600 }, { "epoch": 0.57, "grad_norm": 0.23710471391677856, "learning_rate": 4.921404682274248e-06, "loss": 0.0989, "step": 5700 }, { "epoch": 0.58, "grad_norm": 108.25338745117188, "learning_rate": 4.919732441471573e-06, "loss": 0.0836, "step": 5800 }, { "epoch": 0.59, "grad_norm": 0.040300995111465454, "learning_rate": 4.918060200668897e-06, "loss": 0.0674, "step": 5900 }, { "epoch": 0.6, "grad_norm": 0.05142529681324959, "learning_rate": 4.916387959866221e-06, "loss": 0.0854, "step": 6000 }, { "epoch": 0.61, "grad_norm": 0.2920592427253723, "learning_rate": 4.914715719063545e-06, "loss": 0.0603, "step": 6100 }, { "epoch": 0.62, "grad_norm": 4.042372226715088, "learning_rate": 4.91304347826087e-06, "loss": 0.0918, "step": 6200 }, { "epoch": 0.63, "grad_norm": 2.157938241958618, "learning_rate": 4.911371237458195e-06, "loss": 0.0704, "step": 6300 }, { "epoch": 0.64, "grad_norm": 0.40491366386413574, "learning_rate": 4.909698996655519e-06, "loss": 0.0679, "step": 6400 }, { "epoch": 0.65, "grad_norm": 0.10786664485931396, "learning_rate": 4.908026755852844e-06, "loss": 0.0915, "step": 6500 }, { "epoch": 0.66, "grad_norm": 11.927928924560547, "learning_rate": 4.906354515050168e-06, "loss": 0.0622, "step": 6600 }, { "epoch": 0.67, "grad_norm": 0.019695168361067772, "learning_rate": 4.904682274247492e-06, "loss": 0.0755, "step": 6700 }, { "epoch": 0.68, "grad_norm": 0.06486581265926361, "learning_rate": 4.9030100334448166e-06, "loss": 0.0558, "step": 6800 }, { "epoch": 0.69, "grad_norm": 16.396251678466797, "learning_rate": 4.901337792642141e-06, "loss": 0.0648, "step": 6900 }, { "epoch": 0.7, "grad_norm": 0.3079834580421448, "learning_rate": 4.8996655518394655e-06, "loss": 0.0842, "step": 7000 }, { "epoch": 0.71, "grad_norm": 1.2089134454727173, "learning_rate": 4.8979933110367895e-06, "loss": 0.0754, "step": 7100 }, { "epoch": 0.72, "grad_norm": 0.028676798567175865, "learning_rate": 4.896321070234114e-06, "loss": 0.0802, "step": 7200 }, { "epoch": 0.73, "grad_norm": 0.04169798642396927, "learning_rate": 4.8946488294314385e-06, "loss": 0.0654, "step": 7300 }, { "epoch": 0.74, "grad_norm": 8.176326751708984, "learning_rate": 4.8929765886287625e-06, "loss": 0.0747, "step": 7400 }, { "epoch": 0.75, "grad_norm": 0.2586785852909088, "learning_rate": 4.891304347826087e-06, "loss": 0.0716, "step": 7500 }, { "epoch": 0.76, "grad_norm": 0.4150737226009369, "learning_rate": 4.889632107023411e-06, "loss": 0.0481, "step": 7600 }, { "epoch": 0.77, "grad_norm": 0.011774115264415741, "learning_rate": 4.887959866220736e-06, "loss": 0.0552, "step": 7700 }, { "epoch": 0.78, "grad_norm": 0.0569203644990921, "learning_rate": 4.886287625418061e-06, "loss": 0.0598, "step": 7800 }, { "epoch": 0.79, "grad_norm": 16.26318359375, "learning_rate": 4.884615384615385e-06, "loss": 0.0769, "step": 7900 }, { "epoch": 0.8, "grad_norm": 0.13142751157283783, "learning_rate": 4.882943143812709e-06, "loss": 0.0833, "step": 8000 }, { "epoch": 0.81, "grad_norm": 0.16441769897937775, "learning_rate": 4.881270903010033e-06, "loss": 0.0882, "step": 8100 }, { "epoch": 0.82, "grad_norm": 0.3407476544380188, "learning_rate": 4.879598662207358e-06, "loss": 0.0675, "step": 8200 }, { "epoch": 0.83, "grad_norm": 0.1758737415075302, "learning_rate": 4.877926421404683e-06, "loss": 0.0703, "step": 8300 }, { "epoch": 0.84, "grad_norm": 0.0441744327545166, "learning_rate": 4.876254180602007e-06, "loss": 0.0617, "step": 8400 }, { "epoch": 0.85, "grad_norm": 0.028383277356624603, "learning_rate": 4.874581939799332e-06, "loss": 0.0633, "step": 8500 }, { "epoch": 0.86, "grad_norm": 0.03873937577009201, "learning_rate": 4.872909698996656e-06, "loss": 0.0737, "step": 8600 }, { "epoch": 0.87, "grad_norm": 0.0403941310942173, "learning_rate": 4.87123745819398e-06, "loss": 0.0633, "step": 8700 }, { "epoch": 0.88, "grad_norm": 35.07930374145508, "learning_rate": 4.869565217391305e-06, "loss": 0.0547, "step": 8800 }, { "epoch": 0.89, "grad_norm": 0.07603854686021805, "learning_rate": 4.867892976588629e-06, "loss": 0.066, "step": 8900 }, { "epoch": 0.9, "grad_norm": 19.292327880859375, "learning_rate": 4.866220735785954e-06, "loss": 0.0535, "step": 9000 }, { "epoch": 0.91, "grad_norm": 0.07600151747465134, "learning_rate": 4.864548494983278e-06, "loss": 0.0829, "step": 9100 }, { "epoch": 0.92, "grad_norm": 0.5552844405174255, "learning_rate": 4.862876254180603e-06, "loss": 0.067, "step": 9200 }, { "epoch": 0.93, "grad_norm": 0.020827770233154297, "learning_rate": 4.861204013377927e-06, "loss": 0.0563, "step": 9300 }, { "epoch": 0.94, "grad_norm": 0.10981284081935883, "learning_rate": 4.859531772575251e-06, "loss": 0.0773, "step": 9400 }, { "epoch": 0.95, "grad_norm": 0.03421054407954216, "learning_rate": 4.857859531772576e-06, "loss": 0.0646, "step": 9500 }, { "epoch": 0.96, "grad_norm": 0.17162667214870453, "learning_rate": 4.8561872909699e-06, "loss": 0.0518, "step": 9600 }, { "epoch": 0.97, "grad_norm": 0.10754731297492981, "learning_rate": 4.854515050167225e-06, "loss": 0.0709, "step": 9700 }, { "epoch": 0.98, "grad_norm": 0.033437736332416534, "learning_rate": 4.852842809364549e-06, "loss": 0.0629, "step": 9800 }, { "epoch": 0.99, "grad_norm": 0.021083764731884003, "learning_rate": 4.851170568561874e-06, "loss": 0.0757, "step": 9900 }, { "epoch": 1.0, "grad_norm": 6.52938985824585, "learning_rate": 4.849498327759198e-06, "loss": 0.0585, "step": 10000 }, { "epoch": 1.0, "eval_accuracy": 0.983525, "eval_f1": 0.983525, "eval_loss": 0.07067874819040298, "eval_runtime": 133.7009, "eval_samples_per_second": 299.175, "eval_steps_per_second": 299.175, "step": 10000 }, { "epoch": 1.01, "grad_norm": 0.08288461714982986, "learning_rate": 4.847826086956522e-06, "loss": 0.0384, "step": 10100 }, { "epoch": 1.02, "grad_norm": 0.030511723831295967, "learning_rate": 4.8461538461538465e-06, "loss": 0.0564, "step": 10200 }, { "epoch": 1.03, "grad_norm": 0.006339346989989281, "learning_rate": 4.844481605351171e-06, "loss": 0.049, "step": 10300 }, { "epoch": 1.04, "grad_norm": 5.46108341217041, "learning_rate": 4.8428093645484955e-06, "loss": 0.0649, "step": 10400 }, { "epoch": 1.05, "grad_norm": 1.3494547605514526, "learning_rate": 4.8411371237458195e-06, "loss": 0.0512, "step": 10500 }, { "epoch": 1.06, "grad_norm": 3.752967357635498, "learning_rate": 4.839464882943144e-06, "loss": 0.0527, "step": 10600 }, { "epoch": 1.07, "grad_norm": 20.542871475219727, "learning_rate": 4.837792642140468e-06, "loss": 0.0497, "step": 10700 }, { "epoch": 1.08, "grad_norm": 24.39754867553711, "learning_rate": 4.836120401337793e-06, "loss": 0.0653, "step": 10800 }, { "epoch": 1.09, "grad_norm": 0.9044870138168335, "learning_rate": 4.834448160535117e-06, "loss": 0.0375, "step": 10900 }, { "epoch": 1.1, "grad_norm": 8.592361450195312, "learning_rate": 4.832775919732442e-06, "loss": 0.0429, "step": 11000 }, { "epoch": 1.11, "grad_norm": 0.018932342529296875, "learning_rate": 4.831103678929766e-06, "loss": 0.0323, "step": 11100 }, { "epoch": 1.12, "grad_norm": 0.08527997881174088, "learning_rate": 4.82943143812709e-06, "loss": 0.0803, "step": 11200 }, { "epoch": 1.13, "grad_norm": 0.010842899791896343, "learning_rate": 4.827759197324415e-06, "loss": 0.0389, "step": 11300 }, { "epoch": 1.1400000000000001, "grad_norm": 0.025464480742812157, "learning_rate": 4.826086956521739e-06, "loss": 0.0477, "step": 11400 }, { "epoch": 1.15, "grad_norm": 0.29770731925964355, "learning_rate": 4.824414715719064e-06, "loss": 0.0651, "step": 11500 }, { "epoch": 1.16, "grad_norm": 0.16732554137706757, "learning_rate": 4.822742474916388e-06, "loss": 0.0363, "step": 11600 }, { "epoch": 1.17, "grad_norm": 0.036259859800338745, "learning_rate": 4.821070234113713e-06, "loss": 0.0531, "step": 11700 }, { "epoch": 1.18, "grad_norm": 0.02447950281202793, "learning_rate": 4.819397993311037e-06, "loss": 0.0451, "step": 11800 }, { "epoch": 1.19, "grad_norm": 0.15884707868099213, "learning_rate": 4.817725752508361e-06, "loss": 0.0597, "step": 11900 }, { "epoch": 1.2, "grad_norm": 12.805810928344727, "learning_rate": 4.816053511705686e-06, "loss": 0.0455, "step": 12000 }, { "epoch": 1.21, "grad_norm": 0.025491496548056602, "learning_rate": 4.81438127090301e-06, "loss": 0.0615, "step": 12100 }, { "epoch": 1.22, "grad_norm": 0.6481452584266663, "learning_rate": 4.812709030100335e-06, "loss": 0.0606, "step": 12200 }, { "epoch": 1.23, "grad_norm": 0.02471865341067314, "learning_rate": 4.81103678929766e-06, "loss": 0.0505, "step": 12300 }, { "epoch": 1.24, "grad_norm": 0.005726488307118416, "learning_rate": 4.809364548494984e-06, "loss": 0.0484, "step": 12400 }, { "epoch": 1.25, "grad_norm": 0.004293044563382864, "learning_rate": 4.807692307692308e-06, "loss": 0.0581, "step": 12500 }, { "epoch": 1.26, "grad_norm": 0.12100236862897873, "learning_rate": 4.806020066889632e-06, "loss": 0.0616, "step": 12600 }, { "epoch": 1.27, "grad_norm": 0.4037141501903534, "learning_rate": 4.804347826086957e-06, "loss": 0.0445, "step": 12700 }, { "epoch": 1.28, "grad_norm": 5.907632827758789, "learning_rate": 4.802675585284282e-06, "loss": 0.0482, "step": 12800 }, { "epoch": 1.29, "grad_norm": 11.875676155090332, "learning_rate": 4.801003344481606e-06, "loss": 0.0406, "step": 12900 }, { "epoch": 1.3, "grad_norm": 0.1897820234298706, "learning_rate": 4.799331103678931e-06, "loss": 0.0603, "step": 13000 }, { "epoch": 1.31, "grad_norm": 0.04131534695625305, "learning_rate": 4.797658862876255e-06, "loss": 0.0395, "step": 13100 }, { "epoch": 1.32, "grad_norm": 0.11377372592687607, "learning_rate": 4.795986622073579e-06, "loss": 0.059, "step": 13200 }, { "epoch": 1.33, "grad_norm": 0.017639851197600365, "learning_rate": 4.7943143812709035e-06, "loss": 0.0611, "step": 13300 }, { "epoch": 1.34, "grad_norm": 0.03218785300850868, "learning_rate": 4.792642140468228e-06, "loss": 0.0502, "step": 13400 }, { "epoch": 1.35, "grad_norm": 0.049068257212638855, "learning_rate": 4.7909698996655525e-06, "loss": 0.0645, "step": 13500 }, { "epoch": 1.3599999999999999, "grad_norm": 0.014554962515830994, "learning_rate": 4.7892976588628765e-06, "loss": 0.0446, "step": 13600 }, { "epoch": 1.37, "grad_norm": 0.10987705737352371, "learning_rate": 4.787625418060201e-06, "loss": 0.0512, "step": 13700 }, { "epoch": 1.38, "grad_norm": 7.821701526641846, "learning_rate": 4.785953177257525e-06, "loss": 0.0552, "step": 13800 }, { "epoch": 1.3900000000000001, "grad_norm": 0.0972401350736618, "learning_rate": 4.7842809364548495e-06, "loss": 0.0482, "step": 13900 }, { "epoch": 1.4, "grad_norm": 0.009446914307773113, "learning_rate": 4.782608695652174e-06, "loss": 0.0332, "step": 14000 }, { "epoch": 1.41, "grad_norm": 1.3108714818954468, "learning_rate": 4.780936454849498e-06, "loss": 0.0899, "step": 14100 }, { "epoch": 1.42, "grad_norm": 0.04676111787557602, "learning_rate": 4.779264214046823e-06, "loss": 0.0494, "step": 14200 }, { "epoch": 1.43, "grad_norm": 15.421358108520508, "learning_rate": 4.777591973244148e-06, "loss": 0.0521, "step": 14300 }, { "epoch": 1.44, "grad_norm": 0.015584264881908894, "learning_rate": 4.775919732441472e-06, "loss": 0.0453, "step": 14400 }, { "epoch": 1.45, "grad_norm": 3.9511654376983643, "learning_rate": 4.774247491638796e-06, "loss": 0.045, "step": 14500 }, { "epoch": 1.46, "grad_norm": 0.09137571603059769, "learning_rate": 4.77257525083612e-06, "loss": 0.0607, "step": 14600 }, { "epoch": 1.47, "grad_norm": 19.993337631225586, "learning_rate": 4.770903010033445e-06, "loss": 0.043, "step": 14700 }, { "epoch": 1.48, "grad_norm": 0.03419216349720955, "learning_rate": 4.76923076923077e-06, "loss": 0.0561, "step": 14800 }, { "epoch": 1.49, "grad_norm": 0.017296940088272095, "learning_rate": 4.767558528428094e-06, "loss": 0.0296, "step": 14900 }, { "epoch": 1.5, "grad_norm": 4.5799407958984375, "learning_rate": 4.765886287625419e-06, "loss": 0.044, "step": 15000 }, { "epoch": 1.5, "eval_accuracy": 0.983825, "eval_f1": 0.983825, "eval_loss": 0.07533564418554306, "eval_runtime": 133.0916, "eval_samples_per_second": 300.545, "eval_steps_per_second": 300.545, "step": 15000 }, { "epoch": 1.51, "grad_norm": 0.01647213287651539, "learning_rate": 4.764214046822743e-06, "loss": 0.0514, "step": 15100 }, { "epoch": 1.52, "grad_norm": 0.01267435122281313, "learning_rate": 4.762541806020067e-06, "loss": 0.0448, "step": 15200 }, { "epoch": 1.53, "grad_norm": 60.150901794433594, "learning_rate": 4.760869565217392e-06, "loss": 0.0625, "step": 15300 }, { "epoch": 1.54, "grad_norm": 0.058160994201898575, "learning_rate": 4.759197324414716e-06, "loss": 0.0493, "step": 15400 }, { "epoch": 1.55, "grad_norm": 0.02377147413790226, "learning_rate": 4.757525083612041e-06, "loss": 0.053, "step": 15500 }, { "epoch": 1.56, "grad_norm": 0.02492053434252739, "learning_rate": 4.755852842809365e-06, "loss": 0.0499, "step": 15600 }, { "epoch": 1.5699999999999998, "grad_norm": 0.069520004093647, "learning_rate": 4.75418060200669e-06, "loss": 0.0475, "step": 15700 }, { "epoch": 1.58, "grad_norm": 0.18645954132080078, "learning_rate": 4.752508361204014e-06, "loss": 0.0528, "step": 15800 }, { "epoch": 1.5899999999999999, "grad_norm": 0.7005010843276978, "learning_rate": 4.750836120401338e-06, "loss": 0.0554, "step": 15900 }, { "epoch": 1.6, "grad_norm": 11.44532299041748, "learning_rate": 4.749163879598663e-06, "loss": 0.0453, "step": 16000 }, { "epoch": 1.6099999999999999, "grad_norm": 0.028075769543647766, "learning_rate": 4.747491638795987e-06, "loss": 0.0454, "step": 16100 }, { "epoch": 1.62, "grad_norm": 11.789948463439941, "learning_rate": 4.745819397993312e-06, "loss": 0.0456, "step": 16200 }, { "epoch": 1.63, "grad_norm": 0.11324401199817657, "learning_rate": 4.744147157190636e-06, "loss": 0.0549, "step": 16300 }, { "epoch": 1.6400000000000001, "grad_norm": 49.08498001098633, "learning_rate": 4.7424749163879605e-06, "loss": 0.0346, "step": 16400 }, { "epoch": 1.65, "grad_norm": 0.05506042763590813, "learning_rate": 4.740802675585285e-06, "loss": 0.0629, "step": 16500 }, { "epoch": 1.6600000000000001, "grad_norm": 0.012054430320858955, "learning_rate": 4.739130434782609e-06, "loss": 0.0246, "step": 16600 }, { "epoch": 1.67, "grad_norm": 0.14489546418190002, "learning_rate": 4.7374581939799335e-06, "loss": 0.0803, "step": 16700 }, { "epoch": 1.6800000000000002, "grad_norm": 12.88791275024414, "learning_rate": 4.7357859531772575e-06, "loss": 0.0484, "step": 16800 }, { "epoch": 1.69, "grad_norm": 6.472039222717285, "learning_rate": 4.7341137123745824e-06, "loss": 0.0505, "step": 16900 }, { "epoch": 1.7, "grad_norm": 0.09024617075920105, "learning_rate": 4.732441471571907e-06, "loss": 0.0498, "step": 17000 }, { "epoch": 1.71, "grad_norm": 0.6520803570747375, "learning_rate": 4.730769230769231e-06, "loss": 0.0553, "step": 17100 }, { "epoch": 1.72, "grad_norm": 0.04779316112399101, "learning_rate": 4.729096989966555e-06, "loss": 0.0677, "step": 17200 }, { "epoch": 1.73, "grad_norm": 0.020316898822784424, "learning_rate": 4.727424749163879e-06, "loss": 0.0414, "step": 17300 }, { "epoch": 1.74, "grad_norm": 16.33078384399414, "learning_rate": 4.725752508361204e-06, "loss": 0.0506, "step": 17400 }, { "epoch": 1.75, "grad_norm": 0.002986336126923561, "learning_rate": 4.724080267558529e-06, "loss": 0.0328, "step": 17500 }, { "epoch": 1.76, "grad_norm": 10.395185470581055, "learning_rate": 4.722408026755853e-06, "loss": 0.0483, "step": 17600 }, { "epoch": 1.77, "grad_norm": 0.003912128042429686, "learning_rate": 4.720735785953178e-06, "loss": 0.0541, "step": 17700 }, { "epoch": 1.78, "grad_norm": 8.186915397644043, "learning_rate": 4.719063545150502e-06, "loss": 0.0455, "step": 17800 }, { "epoch": 1.79, "grad_norm": 0.02249542437493801, "learning_rate": 4.717391304347826e-06, "loss": 0.0303, "step": 17900 }, { "epoch": 1.8, "grad_norm": 0.6102803945541382, "learning_rate": 4.715719063545151e-06, "loss": 0.049, "step": 18000 }, { "epoch": 1.81, "grad_norm": 0.005787149537354708, "learning_rate": 4.714046822742475e-06, "loss": 0.0495, "step": 18100 }, { "epoch": 1.8199999999999998, "grad_norm": 0.01614544540643692, "learning_rate": 4.7123745819398e-06, "loss": 0.0506, "step": 18200 }, { "epoch": 1.83, "grad_norm": 0.06813820451498032, "learning_rate": 4.710702341137124e-06, "loss": 0.0303, "step": 18300 }, { "epoch": 1.8399999999999999, "grad_norm": 0.004097160417586565, "learning_rate": 4.709030100334449e-06, "loss": 0.0447, "step": 18400 }, { "epoch": 1.85, "grad_norm": 0.0063947769813239574, "learning_rate": 4.707357859531773e-06, "loss": 0.0497, "step": 18500 }, { "epoch": 1.8599999999999999, "grad_norm": 0.026598848402500153, "learning_rate": 4.705685618729097e-06, "loss": 0.066, "step": 18600 }, { "epoch": 1.87, "grad_norm": 0.014818851836025715, "learning_rate": 4.704013377926422e-06, "loss": 0.0358, "step": 18700 }, { "epoch": 1.88, "grad_norm": 0.0058324155397713184, "learning_rate": 4.702341137123746e-06, "loss": 0.0582, "step": 18800 }, { "epoch": 1.8900000000000001, "grad_norm": 13.650464057922363, "learning_rate": 4.700668896321071e-06, "loss": 0.0548, "step": 18900 }, { "epoch": 1.9, "grad_norm": 0.02169709838926792, "learning_rate": 4.698996655518395e-06, "loss": 0.0441, "step": 19000 }, { "epoch": 1.9100000000000001, "grad_norm": 17.301109313964844, "learning_rate": 4.69732441471572e-06, "loss": 0.0583, "step": 19100 }, { "epoch": 1.92, "grad_norm": 0.07251980900764465, "learning_rate": 4.695652173913044e-06, "loss": 0.0434, "step": 19200 }, { "epoch": 1.9300000000000002, "grad_norm": 2.6538381576538086, "learning_rate": 4.693979933110368e-06, "loss": 0.052, "step": 19300 }, { "epoch": 1.94, "grad_norm": 0.03304781764745712, "learning_rate": 4.692307692307693e-06, "loss": 0.0305, "step": 19400 }, { "epoch": 1.95, "grad_norm": 0.0017205484909936786, "learning_rate": 4.6906354515050175e-06, "loss": 0.0263, "step": 19500 }, { "epoch": 1.96, "grad_norm": 0.002642621286213398, "learning_rate": 4.688963210702342e-06, "loss": 0.0445, "step": 19600 }, { "epoch": 1.97, "grad_norm": 3.4412660598754883, "learning_rate": 4.687290969899666e-06, "loss": 0.043, "step": 19700 }, { "epoch": 1.98, "grad_norm": 2.002504587173462, "learning_rate": 4.6856187290969905e-06, "loss": 0.0745, "step": 19800 }, { "epoch": 1.99, "grad_norm": 0.029486706480383873, "learning_rate": 4.6839464882943145e-06, "loss": 0.0444, "step": 19900 }, { "epoch": 2.0, "grad_norm": 0.4088901877403259, "learning_rate": 4.6822742474916394e-06, "loss": 0.0299, "step": 20000 }, { "epoch": 2.0, "eval_accuracy": 0.984325, "eval_f1": 0.984325, "eval_loss": 0.07127311825752258, "eval_runtime": 133.5214, "eval_samples_per_second": 299.577, "eval_steps_per_second": 299.577, "step": 20000 }, { "epoch": 2.01, "grad_norm": 0.011883600614964962, "learning_rate": 4.6806020066889635e-06, "loss": 0.0421, "step": 20100 }, { "epoch": 2.02, "grad_norm": 9.133939743041992, "learning_rate": 4.678929765886288e-06, "loss": 0.0409, "step": 20200 }, { "epoch": 2.03, "grad_norm": 0.11910539120435715, "learning_rate": 4.677257525083612e-06, "loss": 0.0288, "step": 20300 }, { "epoch": 2.04, "grad_norm": 0.025438936427235603, "learning_rate": 4.675585284280936e-06, "loss": 0.0236, "step": 20400 }, { "epoch": 2.05, "grad_norm": 0.008798012509942055, "learning_rate": 4.673913043478261e-06, "loss": 0.0292, "step": 20500 }, { "epoch": 2.06, "grad_norm": 0.0025494080036878586, "learning_rate": 4.672240802675585e-06, "loss": 0.0193, "step": 20600 }, { "epoch": 2.07, "grad_norm": 0.011349349282681942, "learning_rate": 4.67056856187291e-06, "loss": 0.0249, "step": 20700 }, { "epoch": 2.08, "grad_norm": 0.03200814127922058, "learning_rate": 4.668896321070234e-06, "loss": 0.03, "step": 20800 }, { "epoch": 2.09, "grad_norm": 0.013645422644913197, "learning_rate": 4.667224080267559e-06, "loss": 0.0245, "step": 20900 }, { "epoch": 2.1, "grad_norm": 0.006069705821573734, "learning_rate": 4.665551839464883e-06, "loss": 0.0494, "step": 21000 }, { "epoch": 2.11, "grad_norm": 0.9513150453567505, "learning_rate": 4.663879598662207e-06, "loss": 0.0274, "step": 21100 }, { "epoch": 2.12, "grad_norm": 0.021194346249103546, "learning_rate": 4.662207357859532e-06, "loss": 0.0186, "step": 21200 }, { "epoch": 2.13, "grad_norm": 0.010533172637224197, "learning_rate": 4.660535117056856e-06, "loss": 0.0268, "step": 21300 }, { "epoch": 2.14, "grad_norm": 0.013652130030095577, "learning_rate": 4.658862876254181e-06, "loss": 0.0216, "step": 21400 }, { "epoch": 2.15, "grad_norm": 1.2660781145095825, "learning_rate": 4.657190635451506e-06, "loss": 0.0289, "step": 21500 }, { "epoch": 2.16, "grad_norm": 0.008349668234586716, "learning_rate": 4.65551839464883e-06, "loss": 0.0359, "step": 21600 }, { "epoch": 2.17, "grad_norm": 1.1475175619125366, "learning_rate": 4.653846153846154e-06, "loss": 0.018, "step": 21700 }, { "epoch": 2.18, "grad_norm": 0.06436525285243988, "learning_rate": 4.652173913043478e-06, "loss": 0.0274, "step": 21800 }, { "epoch": 2.19, "grad_norm": 0.004730283282697201, "learning_rate": 4.650501672240803e-06, "loss": 0.0321, "step": 21900 }, { "epoch": 2.2, "grad_norm": 0.03142133727669716, "learning_rate": 4.648829431438128e-06, "loss": 0.0309, "step": 22000 }, { "epoch": 2.21, "grad_norm": 0.07798143476247787, "learning_rate": 4.647157190635452e-06, "loss": 0.0197, "step": 22100 }, { "epoch": 2.22, "grad_norm": 0.0035459815990179777, "learning_rate": 4.645484949832777e-06, "loss": 0.0293, "step": 22200 }, { "epoch": 2.23, "grad_norm": 0.04352254420518875, "learning_rate": 4.643812709030101e-06, "loss": 0.0308, "step": 22300 }, { "epoch": 2.24, "grad_norm": 0.0022269452456384897, "learning_rate": 4.642140468227425e-06, "loss": 0.0367, "step": 22400 }, { "epoch": 2.25, "grad_norm": 0.0067388033494353294, "learning_rate": 4.64046822742475e-06, "loss": 0.0309, "step": 22500 }, { "epoch": 2.26, "grad_norm": 0.0021720202639698982, "learning_rate": 4.638795986622074e-06, "loss": 0.0338, "step": 22600 }, { "epoch": 2.27, "grad_norm": 0.01130695827305317, "learning_rate": 4.637123745819399e-06, "loss": 0.0326, "step": 22700 }, { "epoch": 2.2800000000000002, "grad_norm": 0.027412299066781998, "learning_rate": 4.635451505016723e-06, "loss": 0.0288, "step": 22800 }, { "epoch": 2.29, "grad_norm": 23.797739028930664, "learning_rate": 4.6337792642140475e-06, "loss": 0.0237, "step": 22900 }, { "epoch": 2.3, "grad_norm": 0.016235172748565674, "learning_rate": 4.6321070234113715e-06, "loss": 0.037, "step": 23000 }, { "epoch": 2.31, "grad_norm": 0.0032159725669771433, "learning_rate": 4.630434782608696e-06, "loss": 0.0204, "step": 23100 }, { "epoch": 2.32, "grad_norm": 0.0026251012459397316, "learning_rate": 4.6287625418060205e-06, "loss": 0.0346, "step": 23200 }, { "epoch": 2.33, "grad_norm": 0.056268028914928436, "learning_rate": 4.6270903010033445e-06, "loss": 0.0349, "step": 23300 }, { "epoch": 2.34, "grad_norm": 32.879783630371094, "learning_rate": 4.625418060200669e-06, "loss": 0.0355, "step": 23400 }, { "epoch": 2.35, "grad_norm": 0.05592450127005577, "learning_rate": 4.623745819397994e-06, "loss": 0.0255, "step": 23500 }, { "epoch": 2.36, "grad_norm": 0.021603301167488098, "learning_rate": 4.622073578595318e-06, "loss": 0.0351, "step": 23600 }, { "epoch": 2.37, "grad_norm": 0.0038026655092835426, "learning_rate": 4.620401337792642e-06, "loss": 0.0212, "step": 23700 }, { "epoch": 2.38, "grad_norm": 0.002035396406427026, "learning_rate": 4.618729096989966e-06, "loss": 0.032, "step": 23800 }, { "epoch": 2.39, "grad_norm": 3.054328680038452, "learning_rate": 4.617056856187291e-06, "loss": 0.0295, "step": 23900 }, { "epoch": 2.4, "grad_norm": 0.004924594424664974, "learning_rate": 4.615384615384616e-06, "loss": 0.0157, "step": 24000 }, { "epoch": 2.41, "grad_norm": 0.0015326552093029022, "learning_rate": 4.61371237458194e-06, "loss": 0.0257, "step": 24100 }, { "epoch": 2.42, "grad_norm": 45.68626403808594, "learning_rate": 4.612040133779265e-06, "loss": 0.0402, "step": 24200 }, { "epoch": 2.43, "grad_norm": 8.742071151733398, "learning_rate": 4.610367892976589e-06, "loss": 0.0602, "step": 24300 }, { "epoch": 2.44, "grad_norm": 0.11714211106300354, "learning_rate": 4.608695652173913e-06, "loss": 0.0245, "step": 24400 }, { "epoch": 2.45, "grad_norm": 0.02803937904536724, "learning_rate": 4.607023411371238e-06, "loss": 0.0347, "step": 24500 }, { "epoch": 2.46, "grad_norm": 0.0011470599565654993, "learning_rate": 4.605351170568562e-06, "loss": 0.0214, "step": 24600 }, { "epoch": 2.4699999999999998, "grad_norm": 0.004855659324675798, "learning_rate": 4.603678929765887e-06, "loss": 0.025, "step": 24700 }, { "epoch": 2.48, "grad_norm": 0.12526443600654602, "learning_rate": 4.602006688963211e-06, "loss": 0.0208, "step": 24800 }, { "epoch": 2.49, "grad_norm": 0.06537730991840363, "learning_rate": 4.600334448160536e-06, "loss": 0.0489, "step": 24900 }, { "epoch": 2.5, "grad_norm": 0.023756371811032295, "learning_rate": 4.59866220735786e-06, "loss": 0.027, "step": 25000 }, { "epoch": 2.5, "eval_accuracy": 0.98445, "eval_f1": 0.98445, "eval_loss": 0.08253531157970428, "eval_runtime": 133.5816, "eval_samples_per_second": 299.442, "eval_steps_per_second": 299.442, "step": 25000 }, { "epoch": 2.51, "grad_norm": 0.04911258444190025, "learning_rate": 4.596989966555184e-06, "loss": 0.0412, "step": 25100 }, { "epoch": 2.52, "grad_norm": 0.2437671422958374, "learning_rate": 4.595317725752509e-06, "loss": 0.0226, "step": 25200 }, { "epoch": 2.5300000000000002, "grad_norm": 0.01876792125403881, "learning_rate": 4.593645484949833e-06, "loss": 0.032, "step": 25300 }, { "epoch": 2.54, "grad_norm": 0.05922747775912285, "learning_rate": 4.591973244147158e-06, "loss": 0.0256, "step": 25400 }, { "epoch": 2.55, "grad_norm": 0.008321844041347504, "learning_rate": 4.590301003344483e-06, "loss": 0.0189, "step": 25500 }, { "epoch": 2.56, "grad_norm": 37.782745361328125, "learning_rate": 4.588628762541807e-06, "loss": 0.0366, "step": 25600 }, { "epoch": 2.57, "grad_norm": 0.005182887427508831, "learning_rate": 4.586956521739131e-06, "loss": 0.0404, "step": 25700 }, { "epoch": 2.58, "grad_norm": 0.02100360207259655, "learning_rate": 4.585284280936455e-06, "loss": 0.0269, "step": 25800 }, { "epoch": 2.59, "grad_norm": 0.017707619816064835, "learning_rate": 4.58361204013378e-06, "loss": 0.0185, "step": 25900 }, { "epoch": 2.6, "grad_norm": 0.12977050244808197, "learning_rate": 4.581939799331104e-06, "loss": 0.0207, "step": 26000 }, { "epoch": 2.61, "grad_norm": 0.0024964429903775454, "learning_rate": 4.5802675585284286e-06, "loss": 0.0541, "step": 26100 }, { "epoch": 2.62, "grad_norm": 1.7643945217132568, "learning_rate": 4.5785953177257534e-06, "loss": 0.0255, "step": 26200 }, { "epoch": 2.63, "grad_norm": 0.13935905694961548, "learning_rate": 4.5769230769230775e-06, "loss": 0.0331, "step": 26300 }, { "epoch": 2.64, "grad_norm": 0.010719064623117447, "learning_rate": 4.5752508361204015e-06, "loss": 0.025, "step": 26400 }, { "epoch": 2.65, "grad_norm": 0.039186276495456696, "learning_rate": 4.5735785953177255e-06, "loss": 0.0459, "step": 26500 }, { "epoch": 2.66, "grad_norm": 0.06738638877868652, "learning_rate": 4.5719063545150504e-06, "loss": 0.0178, "step": 26600 }, { "epoch": 2.67, "grad_norm": 0.014554286375641823, "learning_rate": 4.570234113712375e-06, "loss": 0.041, "step": 26700 }, { "epoch": 2.68, "grad_norm": 0.032161399722099304, "learning_rate": 4.568561872909699e-06, "loss": 0.0415, "step": 26800 }, { "epoch": 2.69, "grad_norm": 0.006425254512578249, "learning_rate": 4.566889632107024e-06, "loss": 0.0166, "step": 26900 }, { "epoch": 2.7, "grad_norm": 120.80119323730469, "learning_rate": 4.565217391304348e-06, "loss": 0.0515, "step": 27000 }, { "epoch": 2.71, "grad_norm": 0.09112800657749176, "learning_rate": 4.563545150501672e-06, "loss": 0.038, "step": 27100 }, { "epoch": 2.7199999999999998, "grad_norm": 0.24947082996368408, "learning_rate": 4.561872909698997e-06, "loss": 0.0302, "step": 27200 }, { "epoch": 2.73, "grad_norm": 0.03329513594508171, "learning_rate": 4.560200668896321e-06, "loss": 0.0277, "step": 27300 }, { "epoch": 2.74, "grad_norm": 0.010453999973833561, "learning_rate": 4.558528428093646e-06, "loss": 0.0205, "step": 27400 }, { "epoch": 2.75, "grad_norm": 0.18513476848602295, "learning_rate": 4.55685618729097e-06, "loss": 0.0176, "step": 27500 }, { "epoch": 2.76, "grad_norm": 0.016682449728250504, "learning_rate": 4.555183946488295e-06, "loss": 0.0419, "step": 27600 }, { "epoch": 2.77, "grad_norm": 0.0067235734313726425, "learning_rate": 4.553511705685619e-06, "loss": 0.0413, "step": 27700 }, { "epoch": 2.7800000000000002, "grad_norm": 0.008218110539019108, "learning_rate": 4.551839464882943e-06, "loss": 0.0258, "step": 27800 }, { "epoch": 2.79, "grad_norm": 0.006087880581617355, "learning_rate": 4.550167224080268e-06, "loss": 0.0102, "step": 27900 }, { "epoch": 2.8, "grad_norm": 0.0023886614944785833, "learning_rate": 4.548494983277592e-06, "loss": 0.024, "step": 28000 }, { "epoch": 2.81, "grad_norm": 74.37184143066406, "learning_rate": 4.546822742474917e-06, "loss": 0.0469, "step": 28100 }, { "epoch": 2.82, "grad_norm": 2.5758330821990967, "learning_rate": 4.545150501672241e-06, "loss": 0.0288, "step": 28200 }, { "epoch": 2.83, "grad_norm": 10.103140830993652, "learning_rate": 4.543478260869566e-06, "loss": 0.0369, "step": 28300 }, { "epoch": 2.84, "grad_norm": 36.231136322021484, "learning_rate": 4.54180602006689e-06, "loss": 0.0202, "step": 28400 }, { "epoch": 2.85, "grad_norm": 0.0008801018702797592, "learning_rate": 4.540133779264214e-06, "loss": 0.0331, "step": 28500 }, { "epoch": 2.86, "grad_norm": 5.780725955963135, "learning_rate": 4.538461538461539e-06, "loss": 0.0299, "step": 28600 }, { "epoch": 2.87, "grad_norm": 0.0014527951134368777, "learning_rate": 4.536789297658864e-06, "loss": 0.028, "step": 28700 }, { "epoch": 2.88, "grad_norm": 0.0038458879571408033, "learning_rate": 4.535117056856188e-06, "loss": 0.0192, "step": 28800 }, { "epoch": 2.89, "grad_norm": 0.04886145889759064, "learning_rate": 4.533444816053512e-06, "loss": 0.026, "step": 28900 }, { "epoch": 2.9, "grad_norm": 0.034207429736852646, "learning_rate": 4.531772575250837e-06, "loss": 0.0287, "step": 29000 }, { "epoch": 2.91, "grad_norm": 1.1304664611816406, "learning_rate": 4.530100334448161e-06, "loss": 0.0283, "step": 29100 }, { "epoch": 2.92, "grad_norm": 0.0010741673177108169, "learning_rate": 4.5284280936454856e-06, "loss": 0.0284, "step": 29200 }, { "epoch": 2.93, "grad_norm": 0.025598613545298576, "learning_rate": 4.52675585284281e-06, "loss": 0.0251, "step": 29300 }, { "epoch": 2.94, "grad_norm": 0.1095525324344635, "learning_rate": 4.5250836120401345e-06, "loss": 0.0306, "step": 29400 }, { "epoch": 2.95, "grad_norm": 6.888746738433838, "learning_rate": 4.5234113712374585e-06, "loss": 0.0336, "step": 29500 }, { "epoch": 2.96, "grad_norm": 0.03379179537296295, "learning_rate": 4.5217391304347826e-06, "loss": 0.0231, "step": 29600 }, { "epoch": 2.9699999999999998, "grad_norm": 0.00660817651078105, "learning_rate": 4.5200668896321074e-06, "loss": 0.0258, "step": 29700 }, { "epoch": 2.98, "grad_norm": 0.0639776960015297, "learning_rate": 4.5183946488294315e-06, "loss": 0.0459, "step": 29800 }, { "epoch": 2.99, "grad_norm": 0.025130316615104675, "learning_rate": 4.516722408026756e-06, "loss": 0.0371, "step": 29900 }, { "epoch": 3.0, "grad_norm": 12.60319709777832, "learning_rate": 4.51505016722408e-06, "loss": 0.0262, "step": 30000 }, { "epoch": 3.0, "eval_accuracy": 0.985425, "eval_f1": 0.985425, "eval_loss": 0.07783249020576477, "eval_runtime": 134.4077, "eval_samples_per_second": 297.602, "eval_steps_per_second": 297.602, "step": 30000 }, { "epoch": 3.01, "grad_norm": 0.3146221935749054, "learning_rate": 4.513377926421405e-06, "loss": 0.007, "step": 30100 }, { "epoch": 3.02, "grad_norm": 0.02126391790807247, "learning_rate": 4.511705685618729e-06, "loss": 0.0097, "step": 30200 }, { "epoch": 3.03, "grad_norm": 0.0014831286389380693, "learning_rate": 4.510033444816053e-06, "loss": 0.0172, "step": 30300 }, { "epoch": 3.04, "grad_norm": 0.011143388226628304, "learning_rate": 4.508361204013378e-06, "loss": 0.0233, "step": 30400 }, { "epoch": 3.05, "grad_norm": 0.005537919234484434, "learning_rate": 4.506688963210702e-06, "loss": 0.0375, "step": 30500 }, { "epoch": 3.06, "grad_norm": 0.06545744836330414, "learning_rate": 4.505016722408027e-06, "loss": 0.0218, "step": 30600 }, { "epoch": 3.07, "grad_norm": 0.01571493409574032, "learning_rate": 4.503344481605352e-06, "loss": 0.0149, "step": 30700 }, { "epoch": 3.08, "grad_norm": 0.0030460015404969454, "learning_rate": 4.501672240802676e-06, "loss": 0.0233, "step": 30800 }, { "epoch": 3.09, "grad_norm": 0.0006564293871633708, "learning_rate": 4.5e-06, "loss": 0.0068, "step": 30900 }, { "epoch": 3.1, "grad_norm": 0.009112296625971794, "learning_rate": 4.498327759197324e-06, "loss": 0.0067, "step": 31000 }, { "epoch": 3.11, "grad_norm": 20.75179100036621, "learning_rate": 4.496655518394649e-06, "loss": 0.0252, "step": 31100 }, { "epoch": 3.12, "grad_norm": 1.1207340955734253, "learning_rate": 4.494983277591974e-06, "loss": 0.0118, "step": 31200 }, { "epoch": 3.13, "grad_norm": 0.016456255689263344, "learning_rate": 4.493311036789298e-06, "loss": 0.035, "step": 31300 }, { "epoch": 3.14, "grad_norm": 0.0019782930612564087, "learning_rate": 4.491638795986623e-06, "loss": 0.0167, "step": 31400 }, { "epoch": 3.15, "grad_norm": 13.507879257202148, "learning_rate": 4.489966555183947e-06, "loss": 0.0295, "step": 31500 }, { "epoch": 3.16, "grad_norm": 0.37393996119499207, "learning_rate": 4.488294314381271e-06, "loss": 0.0119, "step": 31600 }, { "epoch": 3.17, "grad_norm": 0.001511622336693108, "learning_rate": 4.486622073578596e-06, "loss": 0.0075, "step": 31700 }, { "epoch": 3.18, "grad_norm": 0.005662756972014904, "learning_rate": 4.48494983277592e-06, "loss": 0.0215, "step": 31800 }, { "epoch": 3.19, "grad_norm": 0.042217742651700974, "learning_rate": 4.483277591973245e-06, "loss": 0.0124, "step": 31900 }, { "epoch": 3.2, "grad_norm": 7.794501781463623, "learning_rate": 4.481605351170569e-06, "loss": 0.0256, "step": 32000 }, { "epoch": 3.21, "grad_norm": 0.004403349477797747, "learning_rate": 4.479933110367894e-06, "loss": 0.0202, "step": 32100 }, { "epoch": 3.22, "grad_norm": 0.0071293930523097515, "learning_rate": 4.478260869565218e-06, "loss": 0.0138, "step": 32200 }, { "epoch": 3.23, "grad_norm": 0.020876120775938034, "learning_rate": 4.476588628762542e-06, "loss": 0.0137, "step": 32300 }, { "epoch": 3.24, "grad_norm": 0.18674761056900024, "learning_rate": 4.474916387959867e-06, "loss": 0.0183, "step": 32400 }, { "epoch": 3.25, "grad_norm": 0.09638151526451111, "learning_rate": 4.473244147157191e-06, "loss": 0.0308, "step": 32500 }, { "epoch": 3.26, "grad_norm": 0.0009971513645723462, "learning_rate": 4.4715719063545155e-06, "loss": 0.0204, "step": 32600 }, { "epoch": 3.27, "grad_norm": 0.044390495866537094, "learning_rate": 4.46989966555184e-06, "loss": 0.0177, "step": 32700 }, { "epoch": 3.2800000000000002, "grad_norm": 0.050020378082990646, "learning_rate": 4.4682274247491644e-06, "loss": 0.0114, "step": 32800 }, { "epoch": 3.29, "grad_norm": 18.936073303222656, "learning_rate": 4.4665551839464885e-06, "loss": 0.0184, "step": 32900 }, { "epoch": 3.3, "grad_norm": 0.008622376248240471, "learning_rate": 4.4648829431438125e-06, "loss": 0.0135, "step": 33000 }, { "epoch": 3.31, "grad_norm": 0.005060569848865271, "learning_rate": 4.463210702341137e-06, "loss": 0.0182, "step": 33100 }, { "epoch": 3.32, "grad_norm": 0.004180543590337038, "learning_rate": 4.461538461538462e-06, "loss": 0.0174, "step": 33200 }, { "epoch": 3.33, "grad_norm": 0.0003817367251031101, "learning_rate": 4.459866220735786e-06, "loss": 0.0163, "step": 33300 }, { "epoch": 3.34, "grad_norm": 0.02846093289554119, "learning_rate": 4.458193979933111e-06, "loss": 0.0343, "step": 33400 }, { "epoch": 3.35, "grad_norm": 0.05180152505636215, "learning_rate": 4.456521739130435e-06, "loss": 0.0273, "step": 33500 }, { "epoch": 3.36, "grad_norm": 0.02195872738957405, "learning_rate": 4.454849498327759e-06, "loss": 0.0139, "step": 33600 }, { "epoch": 3.37, "grad_norm": 0.07758418470621109, "learning_rate": 4.453177257525084e-06, "loss": 0.0185, "step": 33700 }, { "epoch": 3.38, "grad_norm": 38.56128692626953, "learning_rate": 4.451505016722408e-06, "loss": 0.0118, "step": 33800 }, { "epoch": 3.39, "grad_norm": 0.009722617454826832, "learning_rate": 4.449832775919733e-06, "loss": 0.0059, "step": 33900 }, { "epoch": 3.4, "grad_norm": 0.0015089679509401321, "learning_rate": 4.448160535117057e-06, "loss": 0.0358, "step": 34000 }, { "epoch": 3.41, "grad_norm": 0.01531163975596428, "learning_rate": 4.446488294314382e-06, "loss": 0.011, "step": 34100 }, { "epoch": 3.42, "grad_norm": 0.009290250949561596, "learning_rate": 4.444816053511706e-06, "loss": 0.0156, "step": 34200 }, { "epoch": 3.43, "grad_norm": 0.09569079428911209, "learning_rate": 4.44314381270903e-06, "loss": 0.0264, "step": 34300 }, { "epoch": 3.44, "grad_norm": 0.0036293207667768, "learning_rate": 4.441471571906355e-06, "loss": 0.0204, "step": 34400 }, { "epoch": 3.45, "grad_norm": 0.002437431598082185, "learning_rate": 4.439799331103679e-06, "loss": 0.0208, "step": 34500 }, { "epoch": 3.46, "grad_norm": 0.0011440415401011705, "learning_rate": 4.438127090301004e-06, "loss": 0.017, "step": 34600 }, { "epoch": 3.4699999999999998, "grad_norm": 0.03422205522656441, "learning_rate": 4.436454849498329e-06, "loss": 0.0068, "step": 34700 }, { "epoch": 3.48, "grad_norm": 11.362318992614746, "learning_rate": 4.434782608695653e-06, "loss": 0.0208, "step": 34800 }, { "epoch": 3.49, "grad_norm": 0.0038512239698320627, "learning_rate": 4.433110367892977e-06, "loss": 0.011, "step": 34900 }, { "epoch": 3.5, "grad_norm": 0.016066549345850945, "learning_rate": 4.431438127090301e-06, "loss": 0.0181, "step": 35000 }, { "epoch": 3.5, "eval_accuracy": 0.9857, "eval_f1": 0.9857, "eval_loss": 0.09534145146608353, "eval_runtime": 134.3932, "eval_samples_per_second": 297.634, "eval_steps_per_second": 297.634, "step": 35000 }, { "epoch": 3.51, "grad_norm": 0.0015177098102867603, "learning_rate": 4.429765886287626e-06, "loss": 0.0243, "step": 35100 }, { "epoch": 3.52, "grad_norm": 0.011164546944200993, "learning_rate": 4.428093645484951e-06, "loss": 0.0174, "step": 35200 }, { "epoch": 3.5300000000000002, "grad_norm": 0.4808778762817383, "learning_rate": 4.426421404682275e-06, "loss": 0.017, "step": 35300 }, { "epoch": 3.54, "grad_norm": 0.07465644180774689, "learning_rate": 4.4247491638795996e-06, "loss": 0.026, "step": 35400 }, { "epoch": 3.55, "grad_norm": 0.05953597277402878, "learning_rate": 4.423076923076924e-06, "loss": 0.0343, "step": 35500 }, { "epoch": 3.56, "grad_norm": 0.2559371590614319, "learning_rate": 4.421404682274248e-06, "loss": 0.0203, "step": 35600 }, { "epoch": 3.57, "grad_norm": 0.000921146129257977, "learning_rate": 4.4197324414715725e-06, "loss": 0.0174, "step": 35700 }, { "epoch": 3.58, "grad_norm": 0.03776590898633003, "learning_rate": 4.4180602006688966e-06, "loss": 0.0152, "step": 35800 }, { "epoch": 3.59, "grad_norm": 9.220237731933594, "learning_rate": 4.4163879598662214e-06, "loss": 0.0214, "step": 35900 }, { "epoch": 3.6, "grad_norm": 0.0010363379260525107, "learning_rate": 4.4147157190635455e-06, "loss": 0.0268, "step": 36000 }, { "epoch": 3.61, "grad_norm": 0.11337871849536896, "learning_rate": 4.41304347826087e-06, "loss": 0.0289, "step": 36100 }, { "epoch": 3.62, "grad_norm": 0.03623759001493454, "learning_rate": 4.411371237458194e-06, "loss": 0.0165, "step": 36200 }, { "epoch": 3.63, "grad_norm": 59.48095703125, "learning_rate": 4.4096989966555184e-06, "loss": 0.0111, "step": 36300 }, { "epoch": 3.64, "grad_norm": 0.0512019619345665, "learning_rate": 4.408026755852843e-06, "loss": 0.0289, "step": 36400 }, { "epoch": 3.65, "grad_norm": 0.004787762649357319, "learning_rate": 4.406354515050167e-06, "loss": 0.0125, "step": 36500 }, { "epoch": 3.66, "grad_norm": 0.2356896698474884, "learning_rate": 4.404682274247492e-06, "loss": 0.0203, "step": 36600 }, { "epoch": 3.67, "grad_norm": 0.07559309899806976, "learning_rate": 4.403010033444816e-06, "loss": 0.0112, "step": 36700 }, { "epoch": 3.68, "grad_norm": 0.0012310033198446035, "learning_rate": 4.401337792642141e-06, "loss": 0.0167, "step": 36800 }, { "epoch": 3.69, "grad_norm": 0.07424982637166977, "learning_rate": 4.399665551839465e-06, "loss": 0.0151, "step": 36900 }, { "epoch": 3.7, "grad_norm": 0.00213808030821383, "learning_rate": 4.397993311036789e-06, "loss": 0.0087, "step": 37000 }, { "epoch": 3.71, "grad_norm": 0.08704973757266998, "learning_rate": 4.396321070234114e-06, "loss": 0.0081, "step": 37100 }, { "epoch": 3.7199999999999998, "grad_norm": 0.0880984216928482, "learning_rate": 4.394648829431438e-06, "loss": 0.0347, "step": 37200 }, { "epoch": 3.73, "grad_norm": 0.0027694138698279858, "learning_rate": 4.392976588628763e-06, "loss": 0.0319, "step": 37300 }, { "epoch": 3.74, "grad_norm": 0.003729386255145073, "learning_rate": 4.391304347826087e-06, "loss": 0.0114, "step": 37400 }, { "epoch": 3.75, "grad_norm": 0.018702877685427666, "learning_rate": 4.389632107023412e-06, "loss": 0.0229, "step": 37500 }, { "epoch": 3.76, "grad_norm": 0.00089402956655249, "learning_rate": 4.387959866220736e-06, "loss": 0.0144, "step": 37600 }, { "epoch": 3.77, "grad_norm": 0.0016536782495677471, "learning_rate": 4.38628762541806e-06, "loss": 0.0251, "step": 37700 }, { "epoch": 3.7800000000000002, "grad_norm": 0.0014757058816030622, "learning_rate": 4.384615384615385e-06, "loss": 0.0407, "step": 37800 }, { "epoch": 3.79, "grad_norm": 0.0005884135607630014, "learning_rate": 4.38294314381271e-06, "loss": 0.0171, "step": 37900 }, { "epoch": 3.8, "grad_norm": 0.0046568759717047215, "learning_rate": 4.381270903010034e-06, "loss": 0.017, "step": 38000 }, { "epoch": 3.81, "grad_norm": 0.03397432714700699, "learning_rate": 4.379598662207358e-06, "loss": 0.0338, "step": 38100 }, { "epoch": 3.82, "grad_norm": 0.0011023873230442405, "learning_rate": 4.377926421404683e-06, "loss": 0.0124, "step": 38200 }, { "epoch": 3.83, "grad_norm": 0.002390155801549554, "learning_rate": 4.376254180602007e-06, "loss": 0.0255, "step": 38300 }, { "epoch": 3.84, "grad_norm": 16.321758270263672, "learning_rate": 4.374581939799332e-06, "loss": 0.0244, "step": 38400 }, { "epoch": 3.85, "grad_norm": 0.0222801323980093, "learning_rate": 4.372909698996656e-06, "loss": 0.0185, "step": 38500 }, { "epoch": 3.86, "grad_norm": 0.002239386085420847, "learning_rate": 4.371237458193981e-06, "loss": 0.0116, "step": 38600 }, { "epoch": 3.87, "grad_norm": 0.00036294609890319407, "learning_rate": 4.369565217391305e-06, "loss": 0.024, "step": 38700 }, { "epoch": 3.88, "grad_norm": 0.0006557508604601026, "learning_rate": 4.367892976588629e-06, "loss": 0.0166, "step": 38800 }, { "epoch": 3.89, "grad_norm": 0.0019109762506559491, "learning_rate": 4.3662207357859536e-06, "loss": 0.0123, "step": 38900 }, { "epoch": 3.9, "grad_norm": 0.0005525159067474306, "learning_rate": 4.364548494983278e-06, "loss": 0.0111, "step": 39000 }, { "epoch": 3.91, "grad_norm": 0.00040455596172250807, "learning_rate": 4.3628762541806025e-06, "loss": 0.0213, "step": 39100 }, { "epoch": 3.92, "grad_norm": 0.0005091234343126416, "learning_rate": 4.3612040133779265e-06, "loss": 0.0156, "step": 39200 }, { "epoch": 3.93, "grad_norm": 0.0005204415647312999, "learning_rate": 4.359531772575251e-06, "loss": 0.0086, "step": 39300 }, { "epoch": 3.94, "grad_norm": 22.845434188842773, "learning_rate": 4.3578595317725754e-06, "loss": 0.022, "step": 39400 }, { "epoch": 3.95, "grad_norm": 0.051367294043302536, "learning_rate": 4.3561872909698995e-06, "loss": 0.0303, "step": 39500 }, { "epoch": 3.96, "grad_norm": 48.237937927246094, "learning_rate": 4.354515050167224e-06, "loss": 0.0177, "step": 39600 }, { "epoch": 3.9699999999999998, "grad_norm": 0.00723648676648736, "learning_rate": 4.352842809364548e-06, "loss": 0.024, "step": 39700 }, { "epoch": 3.98, "grad_norm": 0.07704585045576096, "learning_rate": 4.351170568561873e-06, "loss": 0.0159, "step": 39800 }, { "epoch": 3.99, "grad_norm": 8.135265350341797, "learning_rate": 4.349498327759198e-06, "loss": 0.0237, "step": 39900 }, { "epoch": 4.0, "grad_norm": 0.006071310956031084, "learning_rate": 4.347826086956522e-06, "loss": 0.0291, "step": 40000 }, { "epoch": 4.0, "eval_accuracy": 0.985475, "eval_f1": 0.985475, "eval_loss": 0.07918225228786469, "eval_runtime": 137.0078, "eval_samples_per_second": 291.954, "eval_steps_per_second": 291.954, "step": 40000 }, { "epoch": 4.01, "grad_norm": 0.06916201114654541, "learning_rate": 4.346153846153846e-06, "loss": 0.0175, "step": 40100 }, { "epoch": 4.02, "grad_norm": 0.013865008018910885, "learning_rate": 4.34448160535117e-06, "loss": 0.0096, "step": 40200 }, { "epoch": 4.03, "grad_norm": 0.00577643932774663, "learning_rate": 4.342809364548495e-06, "loss": 0.0143, "step": 40300 }, { "epoch": 4.04, "grad_norm": 0.00038059207145124674, "learning_rate": 4.34113712374582e-06, "loss": 0.0054, "step": 40400 }, { "epoch": 4.05, "grad_norm": 0.0015450104838237166, "learning_rate": 4.339464882943144e-06, "loss": 0.0107, "step": 40500 }, { "epoch": 4.06, "grad_norm": 0.00023171660723164678, "learning_rate": 4.337792642140469e-06, "loss": 0.0055, "step": 40600 }, { "epoch": 4.07, "grad_norm": 0.00019739058916456997, "learning_rate": 4.336120401337793e-06, "loss": 0.0093, "step": 40700 }, { "epoch": 4.08, "grad_norm": 0.0005960884736850858, "learning_rate": 4.334448160535117e-06, "loss": 0.0144, "step": 40800 }, { "epoch": 4.09, "grad_norm": 0.0004620914114639163, "learning_rate": 4.332775919732442e-06, "loss": 0.0192, "step": 40900 }, { "epoch": 4.1, "grad_norm": 0.16995316743850708, "learning_rate": 4.331103678929766e-06, "loss": 0.0061, "step": 41000 }, { "epoch": 4.11, "grad_norm": 0.03042794018983841, "learning_rate": 4.329431438127091e-06, "loss": 0.0102, "step": 41100 }, { "epoch": 4.12, "grad_norm": 0.00546935573220253, "learning_rate": 4.327759197324415e-06, "loss": 0.0164, "step": 41200 }, { "epoch": 4.13, "grad_norm": 0.012843598611652851, "learning_rate": 4.32608695652174e-06, "loss": 0.0089, "step": 41300 }, { "epoch": 4.14, "grad_norm": 0.000863418448716402, "learning_rate": 4.324414715719064e-06, "loss": 0.0182, "step": 41400 }, { "epoch": 4.15, "grad_norm": 0.007233878131955862, "learning_rate": 4.322742474916388e-06, "loss": 0.0094, "step": 41500 }, { "epoch": 4.16, "grad_norm": 0.003833547467365861, "learning_rate": 4.321070234113713e-06, "loss": 0.0095, "step": 41600 }, { "epoch": 4.17, "grad_norm": 0.00022056704619899392, "learning_rate": 4.319397993311037e-06, "loss": 0.0067, "step": 41700 }, { "epoch": 4.18, "grad_norm": 0.00015522346075158566, "learning_rate": 4.317725752508362e-06, "loss": 0.0031, "step": 41800 }, { "epoch": 4.19, "grad_norm": 107.11628723144531, "learning_rate": 4.3160535117056865e-06, "loss": 0.0072, "step": 41900 }, { "epoch": 4.2, "grad_norm": 0.013127053156495094, "learning_rate": 4.3143812709030106e-06, "loss": 0.0164, "step": 42000 }, { "epoch": 4.21, "grad_norm": 0.0003005210601259023, "learning_rate": 4.312709030100335e-06, "loss": 0.0173, "step": 42100 }, { "epoch": 4.22, "grad_norm": 0.0003234084288123995, "learning_rate": 4.311036789297659e-06, "loss": 0.0122, "step": 42200 }, { "epoch": 4.23, "grad_norm": 0.1619456708431244, "learning_rate": 4.3093645484949835e-06, "loss": 0.0099, "step": 42300 }, { "epoch": 4.24, "grad_norm": 0.0008889096206985414, "learning_rate": 4.307692307692308e-06, "loss": 0.0191, "step": 42400 }, { "epoch": 4.25, "grad_norm": 0.0017285248031839728, "learning_rate": 4.3060200668896324e-06, "loss": 0.0106, "step": 42500 }, { "epoch": 4.26, "grad_norm": 0.00019850315584335476, "learning_rate": 4.304347826086957e-06, "loss": 0.0172, "step": 42600 }, { "epoch": 4.27, "grad_norm": 0.00038836541352793574, "learning_rate": 4.302675585284281e-06, "loss": 0.0152, "step": 42700 }, { "epoch": 4.28, "grad_norm": 0.00012064904876751825, "learning_rate": 4.301003344481605e-06, "loss": 0.0062, "step": 42800 }, { "epoch": 4.29, "grad_norm": 0.021381106227636337, "learning_rate": 4.29933110367893e-06, "loss": 0.0141, "step": 42900 }, { "epoch": 4.3, "grad_norm": 0.02078850567340851, "learning_rate": 4.297658862876254e-06, "loss": 0.0144, "step": 43000 }, { "epoch": 4.31, "grad_norm": 0.0006510872044600546, "learning_rate": 4.295986622073579e-06, "loss": 0.0067, "step": 43100 }, { "epoch": 4.32, "grad_norm": 0.0023323597852140665, "learning_rate": 4.294314381270903e-06, "loss": 0.0047, "step": 43200 }, { "epoch": 4.33, "grad_norm": 0.0004781058814842254, "learning_rate": 4.292642140468228e-06, "loss": 0.0063, "step": 43300 }, { "epoch": 4.34, "grad_norm": 0.0051570613868534565, "learning_rate": 4.290969899665552e-06, "loss": 0.0222, "step": 43400 }, { "epoch": 4.35, "grad_norm": 0.23171906173229218, "learning_rate": 4.289297658862876e-06, "loss": 0.0053, "step": 43500 }, { "epoch": 4.36, "grad_norm": 0.04821045324206352, "learning_rate": 4.287625418060201e-06, "loss": 0.0177, "step": 43600 }, { "epoch": 4.37, "grad_norm": 0.0022421092726290226, "learning_rate": 4.285953177257525e-06, "loss": 0.0012, "step": 43700 }, { "epoch": 4.38, "grad_norm": 0.0016597072826698422, "learning_rate": 4.28428093645485e-06, "loss": 0.0063, "step": 43800 }, { "epoch": 4.39, "grad_norm": 0.166465625166893, "learning_rate": 4.282608695652175e-06, "loss": 0.0209, "step": 43900 }, { "epoch": 4.4, "grad_norm": 0.05712735652923584, "learning_rate": 4.280936454849499e-06, "loss": 0.0171, "step": 44000 }, { "epoch": 4.41, "grad_norm": 0.0028721943963319063, "learning_rate": 4.279264214046823e-06, "loss": 0.0002, "step": 44100 }, { "epoch": 4.42, "grad_norm": 0.014931490644812584, "learning_rate": 4.277591973244147e-06, "loss": 0.0155, "step": 44200 }, { "epoch": 4.43, "grad_norm": 0.0601530522108078, "learning_rate": 4.275919732441472e-06, "loss": 0.0048, "step": 44300 }, { "epoch": 4.44, "grad_norm": 0.00023735656577628106, "learning_rate": 4.274247491638797e-06, "loss": 0.0067, "step": 44400 }, { "epoch": 4.45, "grad_norm": 0.00011775241728173569, "learning_rate": 4.272575250836121e-06, "loss": 0.0102, "step": 44500 }, { "epoch": 4.46, "grad_norm": 0.0006663117092102766, "learning_rate": 4.270903010033446e-06, "loss": 0.0159, "step": 44600 }, { "epoch": 4.47, "grad_norm": 0.0003513609990477562, "learning_rate": 4.26923076923077e-06, "loss": 0.0035, "step": 44700 }, { "epoch": 4.48, "grad_norm": 0.00022971085854806006, "learning_rate": 4.267558528428094e-06, "loss": 0.0238, "step": 44800 }, { "epoch": 4.49, "grad_norm": 0.0004504798853304237, "learning_rate": 4.265886287625419e-06, "loss": 0.012, "step": 44900 }, { "epoch": 4.5, "grad_norm": 0.00031274466891773045, "learning_rate": 4.264214046822743e-06, "loss": 0.0194, "step": 45000 }, { "epoch": 4.5, "eval_accuracy": 0.98525, "eval_f1": 0.98525, "eval_loss": 0.10297723114490509, "eval_runtime": 138.583, "eval_samples_per_second": 288.636, "eval_steps_per_second": 288.636, "step": 45000 }, { "epoch": 4.51, "grad_norm": 0.0387171134352684, "learning_rate": 4.2625418060200676e-06, "loss": 0.0134, "step": 45100 }, { "epoch": 4.52, "grad_norm": 0.00037836728733964264, "learning_rate": 4.260869565217392e-06, "loss": 0.0142, "step": 45200 }, { "epoch": 4.53, "grad_norm": 0.04093967378139496, "learning_rate": 4.2591973244147165e-06, "loss": 0.0105, "step": 45300 }, { "epoch": 4.54, "grad_norm": 0.0007085870602168143, "learning_rate": 4.2575250836120405e-06, "loss": 0.0196, "step": 45400 }, { "epoch": 4.55, "grad_norm": 0.003028183476999402, "learning_rate": 4.2558528428093646e-06, "loss": 0.005, "step": 45500 }, { "epoch": 4.5600000000000005, "grad_norm": 0.019922267645597458, "learning_rate": 4.2541806020066895e-06, "loss": 0.0144, "step": 45600 }, { "epoch": 4.57, "grad_norm": 0.0007607261650264263, "learning_rate": 4.2525083612040135e-06, "loss": 0.0121, "step": 45700 }, { "epoch": 4.58, "grad_norm": 0.0019423003541305661, "learning_rate": 4.250836120401338e-06, "loss": 0.0075, "step": 45800 }, { "epoch": 4.59, "grad_norm": 0.0002597976417746395, "learning_rate": 4.249163879598662e-06, "loss": 0.0077, "step": 45900 }, { "epoch": 4.6, "grad_norm": 0.0011591215152293444, "learning_rate": 4.247491638795987e-06, "loss": 0.0206, "step": 46000 }, { "epoch": 4.61, "grad_norm": 0.02335204742848873, "learning_rate": 4.245819397993311e-06, "loss": 0.0175, "step": 46100 }, { "epoch": 4.62, "grad_norm": 0.004985714331269264, "learning_rate": 4.244147157190635e-06, "loss": 0.0134, "step": 46200 }, { "epoch": 4.63, "grad_norm": 0.00042676858720369637, "learning_rate": 4.24247491638796e-06, "loss": 0.0248, "step": 46300 }, { "epoch": 4.64, "grad_norm": 0.03829636424779892, "learning_rate": 4.240802675585284e-06, "loss": 0.0034, "step": 46400 }, { "epoch": 4.65, "grad_norm": 0.0008630960946902633, "learning_rate": 4.239130434782609e-06, "loss": 0.0099, "step": 46500 }, { "epoch": 4.66, "grad_norm": 16.075529098510742, "learning_rate": 4.237458193979934e-06, "loss": 0.0116, "step": 46600 }, { "epoch": 4.67, "grad_norm": 0.10533302277326584, "learning_rate": 4.235785953177258e-06, "loss": 0.0092, "step": 46700 }, { "epoch": 4.68, "grad_norm": 0.01659245416522026, "learning_rate": 4.234113712374582e-06, "loss": 0.0272, "step": 46800 }, { "epoch": 4.6899999999999995, "grad_norm": 0.043937236070632935, "learning_rate": 4.232441471571906e-06, "loss": 0.0188, "step": 46900 }, { "epoch": 4.7, "grad_norm": 0.00019682837591972202, "learning_rate": 4.230769230769231e-06, "loss": 0.0132, "step": 47000 }, { "epoch": 4.71, "grad_norm": 0.001616219524294138, "learning_rate": 4.229096989966556e-06, "loss": 0.0112, "step": 47100 }, { "epoch": 4.72, "grad_norm": 0.004383947234600782, "learning_rate": 4.22742474916388e-06, "loss": 0.0096, "step": 47200 }, { "epoch": 4.73, "grad_norm": 0.003946308046579361, "learning_rate": 4.225752508361204e-06, "loss": 0.0126, "step": 47300 }, { "epoch": 4.74, "grad_norm": 0.010722492821514606, "learning_rate": 4.224080267558529e-06, "loss": 0.0072, "step": 47400 }, { "epoch": 4.75, "grad_norm": 0.0002598193532321602, "learning_rate": 4.222408026755853e-06, "loss": 0.008, "step": 47500 }, { "epoch": 4.76, "grad_norm": 0.03757161274552345, "learning_rate": 4.220735785953178e-06, "loss": 0.0086, "step": 47600 }, { "epoch": 4.77, "grad_norm": 0.0003191785654053092, "learning_rate": 4.219063545150502e-06, "loss": 0.0242, "step": 47700 }, { "epoch": 4.78, "grad_norm": 0.006073308642953634, "learning_rate": 4.217391304347827e-06, "loss": 0.0075, "step": 47800 }, { "epoch": 4.79, "grad_norm": 0.00026708669611252844, "learning_rate": 4.215719063545151e-06, "loss": 0.0121, "step": 47900 }, { "epoch": 4.8, "grad_norm": 0.0015478582354262471, "learning_rate": 4.214046822742475e-06, "loss": 0.0116, "step": 48000 }, { "epoch": 4.8100000000000005, "grad_norm": 0.0008126681204885244, "learning_rate": 4.2123745819398e-06, "loss": 0.0063, "step": 48100 }, { "epoch": 4.82, "grad_norm": 0.05607163533568382, "learning_rate": 4.210702341137124e-06, "loss": 0.0102, "step": 48200 }, { "epoch": 4.83, "grad_norm": 0.00012003448500763625, "learning_rate": 4.209030100334449e-06, "loss": 0.0149, "step": 48300 }, { "epoch": 4.84, "grad_norm": 1.0828863382339478, "learning_rate": 4.207357859531773e-06, "loss": 0.012, "step": 48400 }, { "epoch": 4.85, "grad_norm": 0.1325209140777588, "learning_rate": 4.2056856187290975e-06, "loss": 0.0247, "step": 48500 }, { "epoch": 4.86, "grad_norm": 0.01746418885886669, "learning_rate": 4.2040133779264216e-06, "loss": 0.0053, "step": 48600 }, { "epoch": 4.87, "grad_norm": 0.004969676490873098, "learning_rate": 4.202341137123746e-06, "loss": 0.0059, "step": 48700 }, { "epoch": 4.88, "grad_norm": 0.00017602667503524572, "learning_rate": 4.2006688963210705e-06, "loss": 0.0169, "step": 48800 }, { "epoch": 4.89, "grad_norm": 0.0011687574442476034, "learning_rate": 4.1989966555183945e-06, "loss": 0.0098, "step": 48900 }, { "epoch": 4.9, "grad_norm": 0.008183939382433891, "learning_rate": 4.197324414715719e-06, "loss": 0.01, "step": 49000 }, { "epoch": 4.91, "grad_norm": 0.0027384813874959946, "learning_rate": 4.195652173913044e-06, "loss": 0.0166, "step": 49100 }, { "epoch": 4.92, "grad_norm": 0.0357474759221077, "learning_rate": 4.193979933110368e-06, "loss": 0.0113, "step": 49200 }, { "epoch": 4.93, "grad_norm": 0.030190743505954742, "learning_rate": 4.192307692307692e-06, "loss": 0.0203, "step": 49300 }, { "epoch": 4.9399999999999995, "grad_norm": 0.026387318968772888, "learning_rate": 4.190635451505016e-06, "loss": 0.0165, "step": 49400 }, { "epoch": 4.95, "grad_norm": 0.0477469228208065, "learning_rate": 4.188963210702341e-06, "loss": 0.0038, "step": 49500 }, { "epoch": 4.96, "grad_norm": 0.009463177062571049, "learning_rate": 4.187290969899666e-06, "loss": 0.0194, "step": 49600 }, { "epoch": 4.97, "grad_norm": 0.001094246399588883, "learning_rate": 4.18561872909699e-06, "loss": 0.0016, "step": 49700 }, { "epoch": 4.98, "grad_norm": 0.0002311818243470043, "learning_rate": 4.183946488294315e-06, "loss": 0.0061, "step": 49800 }, { "epoch": 4.99, "grad_norm": 0.0019176400965079665, "learning_rate": 4.182274247491639e-06, "loss": 0.0117, "step": 49900 }, { "epoch": 5.0, "grad_norm": 0.00022912977146916091, "learning_rate": 4.180602006688963e-06, "loss": 0.0109, "step": 50000 }, { "epoch": 5.0, "eval_accuracy": 0.985925, "eval_f1": 0.985925, "eval_loss": 0.10268224030733109, "eval_runtime": 137.6245, "eval_samples_per_second": 290.646, "eval_steps_per_second": 290.646, "step": 50000 }, { "epoch": 5.01, "grad_norm": 8.346785034518689e-05, "learning_rate": 4.178929765886288e-06, "loss": 0.0015, "step": 50100 }, { "epoch": 5.02, "grad_norm": 0.000289654330117628, "learning_rate": 4.177257525083612e-06, "loss": 0.0, "step": 50200 }, { "epoch": 5.03, "grad_norm": 0.0004644142754841596, "learning_rate": 4.175585284280937e-06, "loss": 0.0089, "step": 50300 }, { "epoch": 5.04, "grad_norm": 0.0007376486901193857, "learning_rate": 4.173913043478261e-06, "loss": 0.0098, "step": 50400 }, { "epoch": 5.05, "grad_norm": 0.00011336214083712548, "learning_rate": 4.172240802675586e-06, "loss": 0.0076, "step": 50500 }, { "epoch": 5.06, "grad_norm": 0.002228671684861183, "learning_rate": 4.17056856187291e-06, "loss": 0.0089, "step": 50600 }, { "epoch": 5.07, "grad_norm": 0.005318440962582827, "learning_rate": 4.168896321070234e-06, "loss": 0.0016, "step": 50700 }, { "epoch": 5.08, "grad_norm": 0.013958403840661049, "learning_rate": 4.167224080267559e-06, "loss": 0.0041, "step": 50800 }, { "epoch": 5.09, "grad_norm": 0.00047520818770863116, "learning_rate": 4.165551839464883e-06, "loss": 0.0043, "step": 50900 }, { "epoch": 5.1, "grad_norm": 27.530658721923828, "learning_rate": 4.163879598662208e-06, "loss": 0.0121, "step": 51000 }, { "epoch": 5.11, "grad_norm": 0.26363804936408997, "learning_rate": 4.162207357859533e-06, "loss": 0.0086, "step": 51100 }, { "epoch": 5.12, "grad_norm": 0.0017211829544976354, "learning_rate": 4.160535117056857e-06, "loss": 0.0134, "step": 51200 }, { "epoch": 5.13, "grad_norm": 0.0003323334385640919, "learning_rate": 4.158862876254181e-06, "loss": 0.0161, "step": 51300 }, { "epoch": 5.14, "grad_norm": 0.023742739111185074, "learning_rate": 4.157190635451505e-06, "loss": 0.0076, "step": 51400 }, { "epoch": 5.15, "grad_norm": 0.0006456200499087572, "learning_rate": 4.15551839464883e-06, "loss": 0.0158, "step": 51500 }, { "epoch": 5.16, "grad_norm": 0.23926404118537903, "learning_rate": 4.1538461538461545e-06, "loss": 0.012, "step": 51600 }, { "epoch": 5.17, "grad_norm": 0.41261881589889526, "learning_rate": 4.1521739130434786e-06, "loss": 0.0002, "step": 51700 }, { "epoch": 5.18, "grad_norm": 0.022988926619291306, "learning_rate": 4.1505016722408035e-06, "loss": 0.0173, "step": 51800 }, { "epoch": 5.19, "grad_norm": 0.00015973170229699463, "learning_rate": 4.1488294314381275e-06, "loss": 0.0019, "step": 51900 }, { "epoch": 5.2, "grad_norm": 0.00015827955212444067, "learning_rate": 4.1471571906354515e-06, "loss": 0.014, "step": 52000 }, { "epoch": 5.21, "grad_norm": 0.0023194041568785906, "learning_rate": 4.145484949832776e-06, "loss": 0.004, "step": 52100 }, { "epoch": 5.22, "grad_norm": 0.0071893418207764626, "learning_rate": 4.1438127090301005e-06, "loss": 0.0046, "step": 52200 }, { "epoch": 5.23, "grad_norm": 0.0016534804599359632, "learning_rate": 4.142140468227425e-06, "loss": 0.0043, "step": 52300 }, { "epoch": 5.24, "grad_norm": 0.0004097437486052513, "learning_rate": 4.140468227424749e-06, "loss": 0.0213, "step": 52400 }, { "epoch": 5.25, "grad_norm": 0.004914246965199709, "learning_rate": 4.138795986622074e-06, "loss": 0.0075, "step": 52500 }, { "epoch": 5.26, "grad_norm": 0.0005830037407577038, "learning_rate": 4.137123745819398e-06, "loss": 0.0116, "step": 52600 }, { "epoch": 5.27, "grad_norm": 0.0004236988606862724, "learning_rate": 4.135451505016722e-06, "loss": 0.0054, "step": 52700 }, { "epoch": 5.28, "grad_norm": 0.05537344887852669, "learning_rate": 4.133779264214047e-06, "loss": 0.0069, "step": 52800 }, { "epoch": 5.29, "grad_norm": 5.937612513662316e-05, "learning_rate": 4.132107023411371e-06, "loss": 0.0053, "step": 52900 }, { "epoch": 5.3, "grad_norm": 0.04086212441325188, "learning_rate": 4.130434782608696e-06, "loss": 0.006, "step": 53000 }, { "epoch": 5.31, "grad_norm": 6.80720477248542e-05, "learning_rate": 4.128762541806021e-06, "loss": 0.0087, "step": 53100 }, { "epoch": 5.32, "grad_norm": 0.11411894112825394, "learning_rate": 4.127090301003345e-06, "loss": 0.0055, "step": 53200 }, { "epoch": 5.33, "grad_norm": 0.0021275030449032784, "learning_rate": 4.125418060200669e-06, "loss": 0.0112, "step": 53300 }, { "epoch": 5.34, "grad_norm": 0.0030677011236548424, "learning_rate": 4.123745819397993e-06, "loss": 0.0049, "step": 53400 }, { "epoch": 5.35, "grad_norm": 0.0002973228693008423, "learning_rate": 4.122073578595318e-06, "loss": 0.0083, "step": 53500 }, { "epoch": 5.36, "grad_norm": 0.0001635253574931994, "learning_rate": 4.120401337792643e-06, "loss": 0.0115, "step": 53600 }, { "epoch": 5.37, "grad_norm": 96.96515655517578, "learning_rate": 4.118729096989967e-06, "loss": 0.0046, "step": 53700 }, { "epoch": 5.38, "grad_norm": 0.007858989760279655, "learning_rate": 4.117056856187292e-06, "loss": 0.0127, "step": 53800 }, { "epoch": 5.39, "grad_norm": 0.0028765483293682337, "learning_rate": 4.115384615384616e-06, "loss": 0.0014, "step": 53900 }, { "epoch": 5.4, "grad_norm": 0.0008154021925292909, "learning_rate": 4.11371237458194e-06, "loss": 0.0077, "step": 54000 }, { "epoch": 5.41, "grad_norm": 0.001322218799032271, "learning_rate": 4.112040133779265e-06, "loss": 0.0056, "step": 54100 }, { "epoch": 5.42, "grad_norm": 0.023289771750569344, "learning_rate": 4.110367892976589e-06, "loss": 0.0156, "step": 54200 }, { "epoch": 5.43, "grad_norm": 0.005015732254832983, "learning_rate": 4.108695652173914e-06, "loss": 0.0099, "step": 54300 }, { "epoch": 5.44, "grad_norm": 7.15251371730119e-05, "learning_rate": 4.107023411371238e-06, "loss": 0.0184, "step": 54400 }, { "epoch": 5.45, "grad_norm": 0.00010623384878272191, "learning_rate": 4.105351170568563e-06, "loss": 0.0032, "step": 54500 }, { "epoch": 5.46, "grad_norm": 0.0029225721955299377, "learning_rate": 4.103678929765887e-06, "loss": 0.0112, "step": 54600 }, { "epoch": 5.47, "grad_norm": 0.000662469130475074, "learning_rate": 4.102006688963211e-06, "loss": 0.0089, "step": 54700 }, { "epoch": 5.48, "grad_norm": 0.0004447357205208391, "learning_rate": 4.100334448160536e-06, "loss": 0.0095, "step": 54800 }, { "epoch": 5.49, "grad_norm": 0.0009589354740455747, "learning_rate": 4.09866220735786e-06, "loss": 0.0151, "step": 54900 }, { "epoch": 5.5, "grad_norm": 0.002127012936398387, "learning_rate": 4.0969899665551845e-06, "loss": 0.0075, "step": 55000 }, { "epoch": 5.5, "eval_accuracy": 0.98495, "eval_f1": 0.98495, "eval_loss": 0.11519099026918411, "eval_runtime": 135.3873, "eval_samples_per_second": 295.449, "eval_steps_per_second": 295.449, "step": 55000 }, { "epoch": 5.51, "grad_norm": 0.00014911459584254771, "learning_rate": 4.095317725752509e-06, "loss": 0.0086, "step": 55100 }, { "epoch": 5.52, "grad_norm": 0.00037860465818084776, "learning_rate": 4.093645484949833e-06, "loss": 0.0072, "step": 55200 }, { "epoch": 5.53, "grad_norm": 9.189704724121839e-05, "learning_rate": 4.0919732441471575e-06, "loss": 0.0122, "step": 55300 }, { "epoch": 5.54, "grad_norm": 0.0012258023489266634, "learning_rate": 4.0903010033444815e-06, "loss": 0.0046, "step": 55400 }, { "epoch": 5.55, "grad_norm": 0.0069747380912303925, "learning_rate": 4.088628762541806e-06, "loss": 0.0097, "step": 55500 }, { "epoch": 5.5600000000000005, "grad_norm": 0.00017239715089090168, "learning_rate": 4.086956521739131e-06, "loss": 0.0009, "step": 55600 }, { "epoch": 5.57, "grad_norm": 0.0012025295291095972, "learning_rate": 4.085284280936455e-06, "loss": 0.0111, "step": 55700 }, { "epoch": 5.58, "grad_norm": 0.07929231971502304, "learning_rate": 4.08361204013378e-06, "loss": 0.0067, "step": 55800 }, { "epoch": 5.59, "grad_norm": 0.0003187973634339869, "learning_rate": 4.081939799331104e-06, "loss": 0.0157, "step": 55900 }, { "epoch": 5.6, "grad_norm": 0.0013896484160795808, "learning_rate": 4.080267558528428e-06, "loss": 0.0059, "step": 56000 }, { "epoch": 5.61, "grad_norm": 7.541276863776147e-05, "learning_rate": 4.078595317725752e-06, "loss": 0.0083, "step": 56100 }, { "epoch": 5.62, "grad_norm": 0.005246707238256931, "learning_rate": 4.076923076923077e-06, "loss": 0.0177, "step": 56200 }, { "epoch": 5.63, "grad_norm": 0.0028932797722518444, "learning_rate": 4.075250836120402e-06, "loss": 0.0053, "step": 56300 }, { "epoch": 5.64, "grad_norm": 0.0031544279772788286, "learning_rate": 4.073578595317726e-06, "loss": 0.0008, "step": 56400 }, { "epoch": 5.65, "grad_norm": 5.287326712277718e-05, "learning_rate": 4.071906354515051e-06, "loss": 0.0048, "step": 56500 }, { "epoch": 5.66, "grad_norm": 6.959833262953907e-05, "learning_rate": 4.070234113712375e-06, "loss": 0.0064, "step": 56600 }, { "epoch": 5.67, "grad_norm": 0.002718166681006551, "learning_rate": 4.068561872909699e-06, "loss": 0.0151, "step": 56700 }, { "epoch": 5.68, "grad_norm": 3.8481392860412598, "learning_rate": 4.066889632107024e-06, "loss": 0.0099, "step": 56800 }, { "epoch": 5.6899999999999995, "grad_norm": 0.00023723083722870797, "learning_rate": 4.065217391304348e-06, "loss": 0.0001, "step": 56900 }, { "epoch": 5.7, "grad_norm": 0.003258422249928117, "learning_rate": 4.063545150501673e-06, "loss": 0.0088, "step": 57000 }, { "epoch": 5.71, "grad_norm": 8.750943379709497e-05, "learning_rate": 4.061872909698997e-06, "loss": 0.0059, "step": 57100 }, { "epoch": 5.72, "grad_norm": 0.04691299796104431, "learning_rate": 4.060200668896322e-06, "loss": 0.0164, "step": 57200 }, { "epoch": 5.73, "grad_norm": 0.002486943267285824, "learning_rate": 4.058528428093646e-06, "loss": 0.021, "step": 57300 }, { "epoch": 5.74, "grad_norm": 0.00018505608022678643, "learning_rate": 4.05685618729097e-06, "loss": 0.004, "step": 57400 }, { "epoch": 5.75, "grad_norm": 0.0130680613219738, "learning_rate": 4.055183946488295e-06, "loss": 0.0038, "step": 57500 }, { "epoch": 5.76, "grad_norm": 0.002892815973609686, "learning_rate": 4.053511705685619e-06, "loss": 0.0094, "step": 57600 }, { "epoch": 5.77, "grad_norm": 0.0004248152836225927, "learning_rate": 4.051839464882944e-06, "loss": 0.017, "step": 57700 }, { "epoch": 5.78, "grad_norm": 0.024957746267318726, "learning_rate": 4.050167224080268e-06, "loss": 0.009, "step": 57800 }, { "epoch": 5.79, "grad_norm": 5.579432036029175e-05, "learning_rate": 4.048494983277593e-06, "loss": 0.0113, "step": 57900 }, { "epoch": 5.8, "grad_norm": 0.0788150206208229, "learning_rate": 4.046822742474917e-06, "loss": 0.0161, "step": 58000 }, { "epoch": 5.8100000000000005, "grad_norm": 0.00016917276661843061, "learning_rate": 4.045150501672241e-06, "loss": 0.0093, "step": 58100 }, { "epoch": 5.82, "grad_norm": 0.0006015472463332117, "learning_rate": 4.0434782608695655e-06, "loss": 0.0009, "step": 58200 }, { "epoch": 5.83, "grad_norm": 0.0022845177445560694, "learning_rate": 4.0418060200668904e-06, "loss": 0.0085, "step": 58300 }, { "epoch": 5.84, "grad_norm": 0.00025827871286310256, "learning_rate": 4.0401337792642145e-06, "loss": 0.0075, "step": 58400 }, { "epoch": 5.85, "grad_norm": 0.00020007445709779859, "learning_rate": 4.0384615384615385e-06, "loss": 0.0113, "step": 58500 }, { "epoch": 5.86, "grad_norm": 4.889999036095105e-05, "learning_rate": 4.036789297658863e-06, "loss": 0.0092, "step": 58600 }, { "epoch": 5.87, "grad_norm": 13.686702728271484, "learning_rate": 4.035117056856187e-06, "loss": 0.0077, "step": 58700 }, { "epoch": 5.88, "grad_norm": 0.0002892339834943414, "learning_rate": 4.033444816053512e-06, "loss": 0.003, "step": 58800 }, { "epoch": 5.89, "grad_norm": 0.003974799066781998, "learning_rate": 4.031772575250836e-06, "loss": 0.0088, "step": 58900 }, { "epoch": 5.9, "grad_norm": 0.010556302964687347, "learning_rate": 4.030100334448161e-06, "loss": 0.0171, "step": 59000 }, { "epoch": 5.91, "grad_norm": 0.02879416197538376, "learning_rate": 4.028428093645485e-06, "loss": 0.0168, "step": 59100 }, { "epoch": 5.92, "grad_norm": 4.420049299369566e-05, "learning_rate": 4.026755852842809e-06, "loss": 0.0035, "step": 59200 }, { "epoch": 5.93, "grad_norm": 0.0005424703704193234, "learning_rate": 4.025083612040134e-06, "loss": 0.0105, "step": 59300 }, { "epoch": 5.9399999999999995, "grad_norm": 0.07232003659009933, "learning_rate": 4.023411371237458e-06, "loss": 0.0087, "step": 59400 }, { "epoch": 5.95, "grad_norm": 0.00023946212604641914, "learning_rate": 4.021739130434783e-06, "loss": 0.0001, "step": 59500 }, { "epoch": 5.96, "grad_norm": 0.009542220272123814, "learning_rate": 4.020066889632107e-06, "loss": 0.0151, "step": 59600 }, { "epoch": 5.97, "grad_norm": 0.0009311602334491909, "learning_rate": 4.018394648829432e-06, "loss": 0.0074, "step": 59700 }, { "epoch": 5.98, "grad_norm": 0.0011114083463326097, "learning_rate": 4.016722408026756e-06, "loss": 0.0108, "step": 59800 }, { "epoch": 5.99, "grad_norm": 0.0005047828890383244, "learning_rate": 4.01505016722408e-06, "loss": 0.0031, "step": 59900 }, { "epoch": 6.0, "grad_norm": 0.042260460555553436, "learning_rate": 4.013377926421405e-06, "loss": 0.0278, "step": 60000 }, { "epoch": 6.0, "eval_accuracy": 0.986125, "eval_f1": 0.986125, "eval_loss": 0.09157590568065643, "eval_runtime": 134.4638, "eval_samples_per_second": 297.478, "eval_steps_per_second": 297.478, "step": 60000 }, { "epoch": 6.01, "grad_norm": 20.1020565032959, "learning_rate": 4.011705685618729e-06, "loss": 0.0068, "step": 60100 }, { "epoch": 6.02, "grad_norm": 0.0018263482488691807, "learning_rate": 4.010033444816054e-06, "loss": 0.0064, "step": 60200 }, { "epoch": 6.03, "grad_norm": 0.28091853857040405, "learning_rate": 4.008361204013379e-06, "loss": 0.006, "step": 60300 }, { "epoch": 6.04, "grad_norm": 0.018621103838086128, "learning_rate": 4.006688963210703e-06, "loss": 0.0027, "step": 60400 }, { "epoch": 6.05, "grad_norm": 9.468239295529202e-05, "learning_rate": 4.005016722408027e-06, "loss": 0.0043, "step": 60500 }, { "epoch": 6.06, "grad_norm": 0.005317880306392908, "learning_rate": 4.003344481605351e-06, "loss": 0.0029, "step": 60600 }, { "epoch": 6.07, "grad_norm": 0.0015003898879513144, "learning_rate": 4.001672240802676e-06, "loss": 0.0107, "step": 60700 }, { "epoch": 6.08, "grad_norm": 0.6607133746147156, "learning_rate": 4.000000000000001e-06, "loss": 0.0028, "step": 60800 }, { "epoch": 6.09, "grad_norm": 0.014779971912503242, "learning_rate": 3.998327759197325e-06, "loss": 0.0001, "step": 60900 }, { "epoch": 6.1, "grad_norm": 0.0003267451247666031, "learning_rate": 3.99665551839465e-06, "loss": 0.0068, "step": 61000 }, { "epoch": 6.11, "grad_norm": 4.1315361158922315e-05, "learning_rate": 3.994983277591974e-06, "loss": 0.0016, "step": 61100 }, { "epoch": 6.12, "grad_norm": 6.84654587530531e-05, "learning_rate": 3.993311036789298e-06, "loss": 0.0061, "step": 61200 }, { "epoch": 6.13, "grad_norm": 0.000273578567430377, "learning_rate": 3.9916387959866225e-06, "loss": 0.0049, "step": 61300 }, { "epoch": 6.14, "grad_norm": 4.0936502045951784e-05, "learning_rate": 3.989966555183947e-06, "loss": 0.0069, "step": 61400 }, { "epoch": 6.15, "grad_norm": 0.0004429342516232282, "learning_rate": 3.9882943143812715e-06, "loss": 0.0094, "step": 61500 }, { "epoch": 6.16, "grad_norm": 0.00038169074105098844, "learning_rate": 3.9866220735785955e-06, "loss": 0.0048, "step": 61600 }, { "epoch": 6.17, "grad_norm": 0.10792536288499832, "learning_rate": 3.98494983277592e-06, "loss": 0.0049, "step": 61700 }, { "epoch": 6.18, "grad_norm": 4.4029828131897375e-05, "learning_rate": 3.9832775919732444e-06, "loss": 0.0001, "step": 61800 }, { "epoch": 6.19, "grad_norm": 4.2638963350327685e-05, "learning_rate": 3.9816053511705685e-06, "loss": 0.0044, "step": 61900 }, { "epoch": 6.2, "grad_norm": 3.456565536907874e-05, "learning_rate": 3.979933110367893e-06, "loss": 0.0003, "step": 62000 }, { "epoch": 6.21, "grad_norm": 0.0010728957131505013, "learning_rate": 3.978260869565217e-06, "loss": 0.0042, "step": 62100 }, { "epoch": 6.22, "grad_norm": 5.354998938855715e-05, "learning_rate": 3.976588628762542e-06, "loss": 0.0081, "step": 62200 }, { "epoch": 6.23, "grad_norm": 35.901885986328125, "learning_rate": 3.974916387959867e-06, "loss": 0.0096, "step": 62300 }, { "epoch": 6.24, "grad_norm": 0.00024061814474407583, "learning_rate": 3.973244147157191e-06, "loss": 0.0032, "step": 62400 }, { "epoch": 6.25, "grad_norm": 3.811920032603666e-05, "learning_rate": 3.971571906354515e-06, "loss": 0.0049, "step": 62500 }, { "epoch": 6.26, "grad_norm": 2.6195273399353027, "learning_rate": 3.969899665551839e-06, "loss": 0.0029, "step": 62600 }, { "epoch": 6.27, "grad_norm": 0.009201622568070889, "learning_rate": 3.968227424749164e-06, "loss": 0.0035, "step": 62700 }, { "epoch": 6.28, "grad_norm": 9.226119436789304e-05, "learning_rate": 3.966555183946489e-06, "loss": 0.0085, "step": 62800 }, { "epoch": 6.29, "grad_norm": 4.603072375175543e-05, "learning_rate": 3.964882943143813e-06, "loss": 0.0094, "step": 62900 }, { "epoch": 6.3, "grad_norm": 2.712052628339734e-05, "learning_rate": 3.963210702341138e-06, "loss": 0.0, "step": 63000 }, { "epoch": 6.31, "grad_norm": 0.00019533267186488956, "learning_rate": 3.961538461538462e-06, "loss": 0.0105, "step": 63100 }, { "epoch": 6.32, "grad_norm": 7.596238719997928e-05, "learning_rate": 3.959866220735786e-06, "loss": 0.0136, "step": 63200 }, { "epoch": 6.33, "grad_norm": 0.0005583142628893256, "learning_rate": 3.958193979933111e-06, "loss": 0.0083, "step": 63300 }, { "epoch": 6.34, "grad_norm": 0.001194102456793189, "learning_rate": 3.956521739130435e-06, "loss": 0.0098, "step": 63400 }, { "epoch": 6.35, "grad_norm": 3.787357491091825e-05, "learning_rate": 3.95484949832776e-06, "loss": 0.0036, "step": 63500 }, { "epoch": 6.36, "grad_norm": 7.270895730471238e-05, "learning_rate": 3.953177257525084e-06, "loss": 0.0067, "step": 63600 }, { "epoch": 6.37, "grad_norm": 0.000623897067271173, "learning_rate": 3.951505016722409e-06, "loss": 0.0004, "step": 63700 }, { "epoch": 6.38, "grad_norm": 0.0003589978732634336, "learning_rate": 3.949832775919733e-06, "loss": 0.0042, "step": 63800 }, { "epoch": 6.39, "grad_norm": 0.0001246546598849818, "learning_rate": 3.948160535117057e-06, "loss": 0.011, "step": 63900 }, { "epoch": 6.4, "grad_norm": 0.008183490484952927, "learning_rate": 3.946488294314382e-06, "loss": 0.0064, "step": 64000 }, { "epoch": 6.41, "grad_norm": 0.05226984620094299, "learning_rate": 3.944816053511706e-06, "loss": 0.0125, "step": 64100 }, { "epoch": 6.42, "grad_norm": 7.40212417440489e-05, "learning_rate": 3.943143812709031e-06, "loss": 0.0059, "step": 64200 }, { "epoch": 6.43, "grad_norm": 0.00734880194067955, "learning_rate": 3.9414715719063555e-06, "loss": 0.0001, "step": 64300 }, { "epoch": 6.44, "grad_norm": 4.586549403029494e-05, "learning_rate": 3.9397993311036795e-06, "loss": 0.0056, "step": 64400 }, { "epoch": 6.45, "grad_norm": 3.2639989512972534e-05, "learning_rate": 3.938127090301004e-06, "loss": 0.0103, "step": 64500 }, { "epoch": 6.46, "grad_norm": 0.004328950308263302, "learning_rate": 3.936454849498328e-06, "loss": 0.0163, "step": 64600 }, { "epoch": 6.47, "grad_norm": 3.590931373764761e-05, "learning_rate": 3.9347826086956525e-06, "loss": 0.0048, "step": 64700 }, { "epoch": 6.48, "grad_norm": 0.0038940191734582186, "learning_rate": 3.933110367892977e-06, "loss": 0.0091, "step": 64800 }, { "epoch": 6.49, "grad_norm": 0.00018863222794607282, "learning_rate": 3.9314381270903014e-06, "loss": 0.0114, "step": 64900 }, { "epoch": 6.5, "grad_norm": 5.312537177815102e-05, "learning_rate": 3.929765886287626e-06, "loss": 0.0023, "step": 65000 }, { "epoch": 6.5, "eval_accuracy": 0.985975, "eval_f1": 0.985975, "eval_loss": 0.11558345705270767, "eval_runtime": 133.3886, "eval_samples_per_second": 299.876, "eval_steps_per_second": 299.876, "step": 65000 }, { "epoch": 6.51, "grad_norm": 0.0002164523903047666, "learning_rate": 3.92809364548495e-06, "loss": 0.0073, "step": 65100 }, { "epoch": 6.52, "grad_norm": 0.00015249662101268768, "learning_rate": 3.926421404682274e-06, "loss": 0.0034, "step": 65200 }, { "epoch": 6.53, "grad_norm": 0.0001434876030543819, "learning_rate": 3.924749163879599e-06, "loss": 0.0081, "step": 65300 }, { "epoch": 6.54, "grad_norm": 2.771373874566052e-05, "learning_rate": 3.923076923076923e-06, "loss": 0.0087, "step": 65400 }, { "epoch": 6.55, "grad_norm": 3.081748218392022e-05, "learning_rate": 3.921404682274248e-06, "loss": 0.0086, "step": 65500 }, { "epoch": 6.5600000000000005, "grad_norm": 3.04588920698734e-05, "learning_rate": 3.919732441471572e-06, "loss": 0.0065, "step": 65600 }, { "epoch": 6.57, "grad_norm": 0.00013289268827065825, "learning_rate": 3.918060200668897e-06, "loss": 0.0029, "step": 65700 }, { "epoch": 6.58, "grad_norm": 0.0002527164469938725, "learning_rate": 3.916387959866221e-06, "loss": 0.0096, "step": 65800 }, { "epoch": 6.59, "grad_norm": 0.006434381008148193, "learning_rate": 3.914715719063545e-06, "loss": 0.0066, "step": 65900 }, { "epoch": 6.6, "grad_norm": 0.001144530950114131, "learning_rate": 3.91304347826087e-06, "loss": 0.0038, "step": 66000 }, { "epoch": 6.61, "grad_norm": 0.030533278360962868, "learning_rate": 3.911371237458194e-06, "loss": 0.0033, "step": 66100 }, { "epoch": 6.62, "grad_norm": 9.253934695152566e-05, "learning_rate": 3.909698996655519e-06, "loss": 0.0056, "step": 66200 }, { "epoch": 6.63, "grad_norm": 0.0017731015104800463, "learning_rate": 3.908026755852843e-06, "loss": 0.0, "step": 66300 }, { "epoch": 6.64, "grad_norm": 6.34994066786021e-05, "learning_rate": 3.906354515050168e-06, "loss": 0.0115, "step": 66400 }, { "epoch": 6.65, "grad_norm": 3.8429308915510774e-05, "learning_rate": 3.904682274247492e-06, "loss": 0.0097, "step": 66500 }, { "epoch": 6.66, "grad_norm": 0.0302902702242136, "learning_rate": 3.903010033444816e-06, "loss": 0.0115, "step": 66600 }, { "epoch": 6.67, "grad_norm": 0.00047553866170346737, "learning_rate": 3.901337792642141e-06, "loss": 0.0061, "step": 66700 }, { "epoch": 6.68, "grad_norm": 3.929653757950291e-05, "learning_rate": 3.899665551839465e-06, "loss": 0.011, "step": 66800 }, { "epoch": 6.6899999999999995, "grad_norm": 0.041665125638246536, "learning_rate": 3.89799331103679e-06, "loss": 0.0051, "step": 66900 }, { "epoch": 6.7, "grad_norm": 0.014791757799685001, "learning_rate": 3.896321070234114e-06, "loss": 0.0144, "step": 67000 }, { "epoch": 6.71, "grad_norm": 0.0002211692917626351, "learning_rate": 3.894648829431439e-06, "loss": 0.0057, "step": 67100 }, { "epoch": 6.72, "grad_norm": 3.8692807720508426e-05, "learning_rate": 3.892976588628763e-06, "loss": 0.0115, "step": 67200 }, { "epoch": 6.73, "grad_norm": 0.005168182775378227, "learning_rate": 3.891304347826087e-06, "loss": 0.0051, "step": 67300 }, { "epoch": 6.74, "grad_norm": 0.00041154478094540536, "learning_rate": 3.889632107023412e-06, "loss": 0.0001, "step": 67400 }, { "epoch": 6.75, "grad_norm": 5.148116542841308e-05, "learning_rate": 3.8879598662207366e-06, "loss": 0.0069, "step": 67500 }, { "epoch": 6.76, "grad_norm": 0.0013762086164206266, "learning_rate": 3.886287625418061e-06, "loss": 0.0144, "step": 67600 }, { "epoch": 6.77, "grad_norm": 4.9009013309841976e-05, "learning_rate": 3.884615384615385e-06, "loss": 0.0, "step": 67700 }, { "epoch": 6.78, "grad_norm": 3.336561348987743e-05, "learning_rate": 3.8829431438127095e-06, "loss": 0.0047, "step": 67800 }, { "epoch": 6.79, "grad_norm": 0.0003458741120994091, "learning_rate": 3.8812709030100335e-06, "loss": 0.0098, "step": 67900 }, { "epoch": 6.8, "grad_norm": 0.001628655707463622, "learning_rate": 3.8795986622073584e-06, "loss": 0.0073, "step": 68000 }, { "epoch": 6.8100000000000005, "grad_norm": 5.698153836419806e-05, "learning_rate": 3.8779264214046825e-06, "loss": 0.005, "step": 68100 }, { "epoch": 6.82, "grad_norm": 0.0005441823159344494, "learning_rate": 3.876254180602007e-06, "loss": 0.0006, "step": 68200 }, { "epoch": 6.83, "grad_norm": 0.007364595774561167, "learning_rate": 3.874581939799331e-06, "loss": 0.0014, "step": 68300 }, { "epoch": 6.84, "grad_norm": 0.004882550798356533, "learning_rate": 3.8729096989966554e-06, "loss": 0.0284, "step": 68400 }, { "epoch": 6.85, "grad_norm": 0.1382811814546585, "learning_rate": 3.87123745819398e-06, "loss": 0.0095, "step": 68500 }, { "epoch": 6.86, "grad_norm": 0.003247621236369014, "learning_rate": 3.869565217391304e-06, "loss": 0.0126, "step": 68600 }, { "epoch": 6.87, "grad_norm": 0.0005879114614799619, "learning_rate": 3.867892976588629e-06, "loss": 0.0073, "step": 68700 }, { "epoch": 6.88, "grad_norm": 0.00033930817153304815, "learning_rate": 3.866220735785953e-06, "loss": 0.0001, "step": 68800 }, { "epoch": 6.89, "grad_norm": 0.0021851826459169388, "learning_rate": 3.864548494983278e-06, "loss": 0.0097, "step": 68900 }, { "epoch": 6.9, "grad_norm": 0.00033705090754665434, "learning_rate": 3.862876254180602e-06, "loss": 0.0038, "step": 69000 }, { "epoch": 6.91, "grad_norm": 3.90254681406077e-05, "learning_rate": 3.861204013377926e-06, "loss": 0.0047, "step": 69100 }, { "epoch": 6.92, "grad_norm": 0.06699187308549881, "learning_rate": 3.859531772575251e-06, "loss": 0.0001, "step": 69200 }, { "epoch": 6.93, "grad_norm": 2.483546813891735e-05, "learning_rate": 3.857859531772575e-06, "loss": 0.0046, "step": 69300 }, { "epoch": 6.9399999999999995, "grad_norm": 6.92076500854455e-05, "learning_rate": 3.8561872909699e-06, "loss": 0.0189, "step": 69400 }, { "epoch": 6.95, "grad_norm": 7.609539898112416e-05, "learning_rate": 3.854515050167225e-06, "loss": 0.0055, "step": 69500 }, { "epoch": 6.96, "grad_norm": 2.9781913326587528e-05, "learning_rate": 3.852842809364549e-06, "loss": 0.0109, "step": 69600 }, { "epoch": 6.97, "grad_norm": 9.487575880484655e-05, "learning_rate": 3.851170568561873e-06, "loss": 0.0001, "step": 69700 }, { "epoch": 6.98, "grad_norm": 0.03806305304169655, "learning_rate": 3.849498327759197e-06, "loss": 0.0115, "step": 69800 }, { "epoch": 6.99, "grad_norm": 0.0004042502259835601, "learning_rate": 3.847826086956522e-06, "loss": 0.0135, "step": 69900 }, { "epoch": 7.0, "grad_norm": 0.047401949763298035, "learning_rate": 3.846153846153847e-06, "loss": 0.0115, "step": 70000 }, { "epoch": 7.0, "eval_accuracy": 0.986475, "eval_f1": 0.986475, "eval_loss": 0.09723663330078125, "eval_runtime": 133.6436, "eval_samples_per_second": 299.304, "eval_steps_per_second": 299.304, "step": 70000 }, { "epoch": 7.01, "grad_norm": 5.497029997059144e-05, "learning_rate": 3.844481605351171e-06, "loss": 0.0001, "step": 70100 }, { "epoch": 7.02, "grad_norm": 0.04041426256299019, "learning_rate": 3.842809364548496e-06, "loss": 0.0001, "step": 70200 }, { "epoch": 7.03, "grad_norm": 0.000893338059540838, "learning_rate": 3.84113712374582e-06, "loss": 0.0007, "step": 70300 }, { "epoch": 7.04, "grad_norm": 3.694546830956824e-05, "learning_rate": 3.839464882943144e-06, "loss": 0.0008, "step": 70400 }, { "epoch": 7.05, "grad_norm": 0.0002109079505316913, "learning_rate": 3.837792642140469e-06, "loss": 0.0098, "step": 70500 }, { "epoch": 7.06, "grad_norm": 0.000186313918675296, "learning_rate": 3.836120401337793e-06, "loss": 0.0, "step": 70600 }, { "epoch": 7.07, "grad_norm": 0.03715241327881813, "learning_rate": 3.834448160535118e-06, "loss": 0.009, "step": 70700 }, { "epoch": 7.08, "grad_norm": 4.64997865492478e-05, "learning_rate": 3.832775919732442e-06, "loss": 0.0061, "step": 70800 }, { "epoch": 7.09, "grad_norm": 4.012215504189953e-05, "learning_rate": 3.8311036789297665e-06, "loss": 0.0002, "step": 70900 }, { "epoch": 7.1, "grad_norm": 3.099231980741024e-05, "learning_rate": 3.8294314381270906e-06, "loss": 0.0072, "step": 71000 }, { "epoch": 7.11, "grad_norm": 3.7772646464873105e-05, "learning_rate": 3.827759197324415e-06, "loss": 0.0093, "step": 71100 }, { "epoch": 7.12, "grad_norm": 7.007938256720081e-05, "learning_rate": 3.8260869565217395e-06, "loss": 0.008, "step": 71200 }, { "epoch": 7.13, "grad_norm": 8.04358787718229e-05, "learning_rate": 3.8244147157190635e-06, "loss": 0.0003, "step": 71300 }, { "epoch": 7.14, "grad_norm": 0.009740136563777924, "learning_rate": 3.822742474916388e-06, "loss": 0.0034, "step": 71400 }, { "epoch": 7.15, "grad_norm": 4.2379640945000574e-05, "learning_rate": 3.821070234113713e-06, "loss": 0.003, "step": 71500 }, { "epoch": 7.16, "grad_norm": 0.001223787316121161, "learning_rate": 3.819397993311037e-06, "loss": 0.0, "step": 71600 }, { "epoch": 7.17, "grad_norm": 3.370898775756359e-05, "learning_rate": 3.817725752508361e-06, "loss": 0.0, "step": 71700 }, { "epoch": 7.18, "grad_norm": 0.0017037318320944905, "learning_rate": 3.816053511705685e-06, "loss": 0.012, "step": 71800 }, { "epoch": 7.19, "grad_norm": 0.1010851189494133, "learning_rate": 3.8143812709030103e-06, "loss": 0.0086, "step": 71900 }, { "epoch": 7.2, "grad_norm": 3.95359966205433e-05, "learning_rate": 3.812709030100335e-06, "loss": 0.0053, "step": 72000 }, { "epoch": 7.21, "grad_norm": 0.03214002400636673, "learning_rate": 3.811036789297659e-06, "loss": 0.0, "step": 72100 }, { "epoch": 7.22, "grad_norm": 9.369623876409605e-05, "learning_rate": 3.8093645484949837e-06, "loss": 0.0018, "step": 72200 }, { "epoch": 7.23, "grad_norm": 4.141992030781694e-05, "learning_rate": 3.8076923076923077e-06, "loss": 0.0078, "step": 72300 }, { "epoch": 7.24, "grad_norm": 0.006547427736222744, "learning_rate": 3.8060200668896326e-06, "loss": 0.0043, "step": 72400 }, { "epoch": 7.25, "grad_norm": 0.009587573818862438, "learning_rate": 3.804347826086957e-06, "loss": 0.0, "step": 72500 }, { "epoch": 7.26, "grad_norm": 0.0001476012112107128, "learning_rate": 3.802675585284281e-06, "loss": 0.0156, "step": 72600 }, { "epoch": 7.27, "grad_norm": 0.00019884717767126858, "learning_rate": 3.801003344481606e-06, "loss": 0.0, "step": 72700 }, { "epoch": 7.28, "grad_norm": 0.0004751971864607185, "learning_rate": 3.79933110367893e-06, "loss": 0.0, "step": 72800 }, { "epoch": 7.29, "grad_norm": 0.00021417044627014548, "learning_rate": 3.7976588628762545e-06, "loss": 0.0, "step": 72900 }, { "epoch": 7.3, "grad_norm": 2.5816289053182118e-05, "learning_rate": 3.7959866220735793e-06, "loss": 0.0051, "step": 73000 }, { "epoch": 7.31, "grad_norm": 3.802667197305709e-05, "learning_rate": 3.7943143812709034e-06, "loss": 0.0079, "step": 73100 }, { "epoch": 7.32, "grad_norm": 7.326734339585528e-05, "learning_rate": 3.792642140468228e-06, "loss": 0.0001, "step": 73200 }, { "epoch": 7.33, "grad_norm": 2.148441126337275e-05, "learning_rate": 3.790969899665552e-06, "loss": 0.0097, "step": 73300 }, { "epoch": 7.34, "grad_norm": 0.000500816386193037, "learning_rate": 3.7892976588628768e-06, "loss": 0.0036, "step": 73400 }, { "epoch": 7.35, "grad_norm": 2.385185143793933e-05, "learning_rate": 3.7876254180602012e-06, "loss": 0.0039, "step": 73500 }, { "epoch": 7.36, "grad_norm": 2.7678737751557492e-05, "learning_rate": 3.7859531772575253e-06, "loss": 0.0023, "step": 73600 }, { "epoch": 7.37, "grad_norm": 0.01611788012087345, "learning_rate": 3.78428093645485e-06, "loss": 0.0042, "step": 73700 }, { "epoch": 7.38, "grad_norm": 2.1817544620716944e-05, "learning_rate": 3.782608695652174e-06, "loss": 0.0033, "step": 73800 }, { "epoch": 7.39, "grad_norm": 2.7087255148217082e-05, "learning_rate": 3.7809364548494986e-06, "loss": 0.0071, "step": 73900 }, { "epoch": 7.4, "grad_norm": 1.970182165678125e-05, "learning_rate": 3.7792642140468235e-06, "loss": 0.0025, "step": 74000 }, { "epoch": 7.41, "grad_norm": 7.84520962042734e-05, "learning_rate": 3.7775919732441476e-06, "loss": 0.0008, "step": 74100 }, { "epoch": 7.42, "grad_norm": 3.033554276044015e-05, "learning_rate": 3.775919732441472e-06, "loss": 0.011, "step": 74200 }, { "epoch": 7.43, "grad_norm": 0.0001484583190176636, "learning_rate": 3.774247491638796e-06, "loss": 0.0061, "step": 74300 }, { "epoch": 7.44, "grad_norm": 0.00032925844425335526, "learning_rate": 3.772575250836121e-06, "loss": 0.0, "step": 74400 }, { "epoch": 7.45, "grad_norm": 0.0002478963870089501, "learning_rate": 3.7709030100334454e-06, "loss": 0.0059, "step": 74500 }, { "epoch": 7.46, "grad_norm": 0.0003401384165044874, "learning_rate": 3.7692307692307694e-06, "loss": 0.007, "step": 74600 }, { "epoch": 7.47, "grad_norm": 0.0001823725615395233, "learning_rate": 3.7675585284280943e-06, "loss": 0.006, "step": 74700 }, { "epoch": 7.48, "grad_norm": 0.0011837665224447846, "learning_rate": 3.7658862876254184e-06, "loss": 0.0191, "step": 74800 }, { "epoch": 7.49, "grad_norm": 0.003498908132314682, "learning_rate": 3.764214046822743e-06, "loss": 0.0, "step": 74900 }, { "epoch": 7.5, "grad_norm": 0.0009637516341172159, "learning_rate": 3.7625418060200673e-06, "loss": 0.0, "step": 75000 }, { "epoch": 7.5, "eval_accuracy": 0.9865, "eval_f1": 0.9865, "eval_loss": 0.12122364342212677, "eval_runtime": 133.3981, "eval_samples_per_second": 299.854, "eval_steps_per_second": 299.854, "step": 75000 }, { "epoch": 7.51, "grad_norm": 0.0003032281238120049, "learning_rate": 3.7608695652173917e-06, "loss": 0.0118, "step": 75100 }, { "epoch": 7.52, "grad_norm": 4.516799162956886e-05, "learning_rate": 3.759197324414716e-06, "loss": 0.0128, "step": 75200 }, { "epoch": 7.53, "grad_norm": 0.00011762830399675295, "learning_rate": 3.7575250836120402e-06, "loss": 0.001, "step": 75300 }, { "epoch": 7.54, "grad_norm": 0.05131925269961357, "learning_rate": 3.755852842809365e-06, "loss": 0.0001, "step": 75400 }, { "epoch": 7.55, "grad_norm": 1.698526284599211e-05, "learning_rate": 3.7541806020066896e-06, "loss": 0.0, "step": 75500 }, { "epoch": 7.5600000000000005, "grad_norm": 0.03275621682405472, "learning_rate": 3.7525083612040136e-06, "loss": 0.0095, "step": 75600 }, { "epoch": 7.57, "grad_norm": 3.255869887652807e-05, "learning_rate": 3.750836120401338e-06, "loss": 0.0044, "step": 75700 }, { "epoch": 7.58, "grad_norm": 0.10182749480009079, "learning_rate": 3.7491638795986625e-06, "loss": 0.0028, "step": 75800 }, { "epoch": 7.59, "grad_norm": 9.048231731867418e-05, "learning_rate": 3.747491638795987e-06, "loss": 0.0, "step": 75900 }, { "epoch": 7.6, "grad_norm": 1.5391413398901932e-05, "learning_rate": 3.745819397993311e-06, "loss": 0.0037, "step": 76000 }, { "epoch": 7.61, "grad_norm": 2.5761735741980374e-05, "learning_rate": 3.7441471571906355e-06, "loss": 0.0001, "step": 76100 }, { "epoch": 7.62, "grad_norm": 1.6684647562215105e-05, "learning_rate": 3.7424749163879604e-06, "loss": 0.0067, "step": 76200 }, { "epoch": 7.63, "grad_norm": 3.152184217469767e-05, "learning_rate": 3.7408026755852844e-06, "loss": 0.0096, "step": 76300 }, { "epoch": 7.64, "grad_norm": 0.0017304780194535851, "learning_rate": 3.739130434782609e-06, "loss": 0.0122, "step": 76400 }, { "epoch": 7.65, "grad_norm": 0.004649181850254536, "learning_rate": 3.7374581939799333e-06, "loss": 0.0099, "step": 76500 }, { "epoch": 7.66, "grad_norm": 2.4674011001479812e-05, "learning_rate": 3.735785953177258e-06, "loss": 0.0, "step": 76600 }, { "epoch": 7.67, "grad_norm": 4.725448525277898e-05, "learning_rate": 3.7341137123745823e-06, "loss": 0.0009, "step": 76700 }, { "epoch": 7.68, "grad_norm": 2.612950629554689e-05, "learning_rate": 3.7324414715719063e-06, "loss": 0.0046, "step": 76800 }, { "epoch": 7.6899999999999995, "grad_norm": 0.00784962996840477, "learning_rate": 3.730769230769231e-06, "loss": 0.0043, "step": 76900 }, { "epoch": 7.7, "grad_norm": 0.0011779394699260592, "learning_rate": 3.7290969899665552e-06, "loss": 0.0072, "step": 77000 }, { "epoch": 7.71, "grad_norm": 0.004041343927383423, "learning_rate": 3.7274247491638797e-06, "loss": 0.0083, "step": 77100 }, { "epoch": 7.72, "grad_norm": 0.0005305418744683266, "learning_rate": 3.7257525083612046e-06, "loss": 0.0062, "step": 77200 }, { "epoch": 7.73, "grad_norm": 4.281869769329205e-05, "learning_rate": 3.7240802675585286e-06, "loss": 0.0054, "step": 77300 }, { "epoch": 7.74, "grad_norm": 9.92060377029702e-05, "learning_rate": 3.722408026755853e-06, "loss": 0.0031, "step": 77400 }, { "epoch": 7.75, "grad_norm": 7.640924013685435e-05, "learning_rate": 3.720735785953177e-06, "loss": 0.013, "step": 77500 }, { "epoch": 7.76, "grad_norm": 0.0024570985697209835, "learning_rate": 3.719063545150502e-06, "loss": 0.0027, "step": 77600 }, { "epoch": 7.77, "grad_norm": 6.261836097110063e-05, "learning_rate": 3.7173913043478264e-06, "loss": 0.0045, "step": 77700 }, { "epoch": 7.78, "grad_norm": 0.0010023782961070538, "learning_rate": 3.7157190635451505e-06, "loss": 0.0043, "step": 77800 }, { "epoch": 7.79, "grad_norm": 8.210300438804552e-05, "learning_rate": 3.7140468227424754e-06, "loss": 0.0065, "step": 77900 }, { "epoch": 7.8, "grad_norm": 4.371823888504878e-05, "learning_rate": 3.7123745819397994e-06, "loss": 0.0074, "step": 78000 }, { "epoch": 7.8100000000000005, "grad_norm": 0.00750390999019146, "learning_rate": 3.710702341137124e-06, "loss": 0.0105, "step": 78100 }, { "epoch": 7.82, "grad_norm": 5.786586189060472e-05, "learning_rate": 3.7090301003344487e-06, "loss": 0.008, "step": 78200 }, { "epoch": 7.83, "grad_norm": 0.0022236057557165623, "learning_rate": 3.7073578595317728e-06, "loss": 0.0001, "step": 78300 }, { "epoch": 7.84, "grad_norm": 0.001743564149364829, "learning_rate": 3.7056856187290972e-06, "loss": 0.0195, "step": 78400 }, { "epoch": 7.85, "grad_norm": 0.00038661062717437744, "learning_rate": 3.7040133779264213e-06, "loss": 0.0029, "step": 78500 }, { "epoch": 7.86, "grad_norm": 0.0006590617704205215, "learning_rate": 3.702341137123746e-06, "loss": 0.0109, "step": 78600 }, { "epoch": 7.87, "grad_norm": 3.630742139648646e-05, "learning_rate": 3.7006688963210706e-06, "loss": 0.0, "step": 78700 }, { "epoch": 7.88, "grad_norm": 20.627477645874023, "learning_rate": 3.6989966555183947e-06, "loss": 0.0084, "step": 78800 }, { "epoch": 7.89, "grad_norm": 0.0004545499396044761, "learning_rate": 3.6973244147157195e-06, "loss": 0.0081, "step": 78900 }, { "epoch": 7.9, "grad_norm": 4.7894198360154405e-05, "learning_rate": 3.6956521739130436e-06, "loss": 0.0125, "step": 79000 }, { "epoch": 7.91, "grad_norm": 0.00010200806718785316, "learning_rate": 3.693979933110368e-06, "loss": 0.0033, "step": 79100 }, { "epoch": 7.92, "grad_norm": 0.03108019381761551, "learning_rate": 3.692307692307693e-06, "loss": 0.0056, "step": 79200 }, { "epoch": 7.93, "grad_norm": 2.7014459192287177e-05, "learning_rate": 3.690635451505017e-06, "loss": 0.0001, "step": 79300 }, { "epoch": 7.9399999999999995, "grad_norm": 0.001831627101637423, "learning_rate": 3.6889632107023414e-06, "loss": 0.0087, "step": 79400 }, { "epoch": 7.95, "grad_norm": 3.703348556882702e-05, "learning_rate": 3.6872909698996655e-06, "loss": 0.0008, "step": 79500 }, { "epoch": 7.96, "grad_norm": 0.013346439227461815, "learning_rate": 3.6856187290969903e-06, "loss": 0.0008, "step": 79600 }, { "epoch": 7.97, "grad_norm": 6.900202424731106e-05, "learning_rate": 3.683946488294315e-06, "loss": 0.0054, "step": 79700 }, { "epoch": 7.98, "grad_norm": 0.014959572814404964, "learning_rate": 3.682274247491639e-06, "loss": 0.0126, "step": 79800 }, { "epoch": 7.99, "grad_norm": 0.0004625216533895582, "learning_rate": 3.6806020066889637e-06, "loss": 0.0009, "step": 79900 }, { "epoch": 8.0, "grad_norm": 0.00030464696465060115, "learning_rate": 3.6789297658862878e-06, "loss": 0.003, "step": 80000 }, { "epoch": 8.0, "eval_accuracy": 0.9853, "eval_f1": 0.9853, "eval_loss": 0.1378253996372223, "eval_runtime": 134.8364, "eval_samples_per_second": 296.656, "eval_steps_per_second": 296.656, "step": 80000 }, { "epoch": 8.01, "grad_norm": 2.883007073251065e-05, "learning_rate": 3.6772575250836122e-06, "loss": 0.0, "step": 80100 }, { "epoch": 8.02, "grad_norm": 0.001146984868682921, "learning_rate": 3.675585284280937e-06, "loss": 0.0, "step": 80200 }, { "epoch": 8.03, "grad_norm": 0.0017075365176424384, "learning_rate": 3.673913043478261e-06, "loss": 0.0049, "step": 80300 }, { "epoch": 8.04, "grad_norm": 1.7525797375128604e-05, "learning_rate": 3.6722408026755856e-06, "loss": 0.0042, "step": 80400 }, { "epoch": 8.05, "grad_norm": 3.318498420412652e-05, "learning_rate": 3.6705685618729096e-06, "loss": 0.0018, "step": 80500 }, { "epoch": 8.06, "grad_norm": 1.8242984879179858e-05, "learning_rate": 3.6688963210702345e-06, "loss": 0.0114, "step": 80600 }, { "epoch": 8.07, "grad_norm": 2.8576640033861622e-05, "learning_rate": 3.667224080267559e-06, "loss": 0.0061, "step": 80700 }, { "epoch": 8.08, "grad_norm": 2.6531528419582173e-05, "learning_rate": 3.665551839464883e-06, "loss": 0.0, "step": 80800 }, { "epoch": 8.09, "grad_norm": 3.302233017166145e-05, "learning_rate": 3.663879598662208e-06, "loss": 0.0037, "step": 80900 }, { "epoch": 8.1, "grad_norm": 3.1501982448389754e-05, "learning_rate": 3.662207357859532e-06, "loss": 0.0055, "step": 81000 }, { "epoch": 8.11, "grad_norm": 0.00011675096902763471, "learning_rate": 3.6605351170568564e-06, "loss": 0.0051, "step": 81100 }, { "epoch": 8.12, "grad_norm": 8.468510350212455e-05, "learning_rate": 3.6588628762541813e-06, "loss": 0.0, "step": 81200 }, { "epoch": 8.13, "grad_norm": 3.6243935028323904e-05, "learning_rate": 3.6571906354515053e-06, "loss": 0.0092, "step": 81300 }, { "epoch": 8.14, "grad_norm": 6.308049341896549e-05, "learning_rate": 3.6555183946488298e-06, "loss": 0.0, "step": 81400 }, { "epoch": 8.15, "grad_norm": 3.311773616587743e-05, "learning_rate": 3.653846153846154e-06, "loss": 0.0016, "step": 81500 }, { "epoch": 8.16, "grad_norm": 0.00015161107876338065, "learning_rate": 3.6521739130434787e-06, "loss": 0.0155, "step": 81600 }, { "epoch": 8.17, "grad_norm": 1.4969894436944742e-05, "learning_rate": 3.650501672240803e-06, "loss": 0.0071, "step": 81700 }, { "epoch": 8.18, "grad_norm": 2.1335055862437002e-05, "learning_rate": 3.648829431438127e-06, "loss": 0.0029, "step": 81800 }, { "epoch": 8.19, "grad_norm": 6.477204442489892e-05, "learning_rate": 3.647157190635452e-06, "loss": 0.0, "step": 81900 }, { "epoch": 8.2, "grad_norm": 0.3152509331703186, "learning_rate": 3.645484949832776e-06, "loss": 0.0129, "step": 82000 }, { "epoch": 8.21, "grad_norm": 2.7150046662427485e-05, "learning_rate": 3.6438127090301006e-06, "loss": 0.0062, "step": 82100 }, { "epoch": 8.22, "grad_norm": 0.0003274416958447546, "learning_rate": 3.6421404682274255e-06, "loss": 0.0049, "step": 82200 }, { "epoch": 8.23, "grad_norm": 0.014805939979851246, "learning_rate": 3.6404682274247495e-06, "loss": 0.0005, "step": 82300 }, { "epoch": 8.24, "grad_norm": 0.07976293563842773, "learning_rate": 3.638795986622074e-06, "loss": 0.0061, "step": 82400 }, { "epoch": 8.25, "grad_norm": 0.000629770162049681, "learning_rate": 3.637123745819398e-06, "loss": 0.0055, "step": 82500 }, { "epoch": 8.26, "grad_norm": 5.8729201555252075e-05, "learning_rate": 3.635451505016723e-06, "loss": 0.0095, "step": 82600 }, { "epoch": 8.27, "grad_norm": 2.821264206431806e-05, "learning_rate": 3.6337792642140473e-06, "loss": 0.0018, "step": 82700 }, { "epoch": 8.28, "grad_norm": 2.697145646379795e-05, "learning_rate": 3.6321070234113714e-06, "loss": 0.0058, "step": 82800 }, { "epoch": 8.29, "grad_norm": 2.547060103097465e-05, "learning_rate": 3.6304347826086963e-06, "loss": 0.0004, "step": 82900 }, { "epoch": 8.3, "grad_norm": 2.24012019316433e-05, "learning_rate": 3.6287625418060203e-06, "loss": 0.0011, "step": 83000 }, { "epoch": 8.31, "grad_norm": 5.2949766541132703e-05, "learning_rate": 3.6270903010033448e-06, "loss": 0.0033, "step": 83100 }, { "epoch": 8.32, "grad_norm": 0.12880505621433258, "learning_rate": 3.6254180602006696e-06, "loss": 0.0051, "step": 83200 }, { "epoch": 8.33, "grad_norm": 5.116287138662301e-05, "learning_rate": 3.6237458193979937e-06, "loss": 0.0028, "step": 83300 }, { "epoch": 8.34, "grad_norm": 2.302383836649824e-05, "learning_rate": 3.622073578595318e-06, "loss": 0.005, "step": 83400 }, { "epoch": 8.35, "grad_norm": 0.002800439950078726, "learning_rate": 3.620401337792642e-06, "loss": 0.0075, "step": 83500 }, { "epoch": 8.36, "grad_norm": 3.3340267691528425e-05, "learning_rate": 3.618729096989967e-06, "loss": 0.0055, "step": 83600 }, { "epoch": 8.37, "grad_norm": 5.368978236219846e-05, "learning_rate": 3.6170568561872915e-06, "loss": 0.0037, "step": 83700 }, { "epoch": 8.38, "grad_norm": 4.120146331842989e-05, "learning_rate": 3.6153846153846156e-06, "loss": 0.0015, "step": 83800 }, { "epoch": 8.39, "grad_norm": 2.2930082195671275e-05, "learning_rate": 3.6137123745819404e-06, "loss": 0.0104, "step": 83900 }, { "epoch": 8.4, "grad_norm": 1.8325974451727234e-05, "learning_rate": 3.6120401337792645e-06, "loss": 0.0085, "step": 84000 }, { "epoch": 8.41, "grad_norm": 1.8396835002931766e-05, "learning_rate": 3.610367892976589e-06, "loss": 0.0, "step": 84100 }, { "epoch": 8.42, "grad_norm": 1.9814324332401156e-05, "learning_rate": 3.6086956521739134e-06, "loss": 0.0, "step": 84200 }, { "epoch": 8.43, "grad_norm": 48.710208892822266, "learning_rate": 3.607023411371238e-06, "loss": 0.015, "step": 84300 }, { "epoch": 8.44, "grad_norm": 2.536492502258625e-05, "learning_rate": 3.6053511705685623e-06, "loss": 0.0078, "step": 84400 }, { "epoch": 8.45, "grad_norm": 0.00015233943122439086, "learning_rate": 3.6036789297658864e-06, "loss": 0.014, "step": 84500 }, { "epoch": 8.46, "grad_norm": 1.858683390310034e-05, "learning_rate": 3.6020066889632112e-06, "loss": 0.0016, "step": 84600 }, { "epoch": 8.47, "grad_norm": 1.7697208022582345e-05, "learning_rate": 3.6003344481605357e-06, "loss": 0.0026, "step": 84700 }, { "epoch": 8.48, "grad_norm": 3.1560120987705886e-05, "learning_rate": 3.5986622073578597e-06, "loss": 0.0, "step": 84800 }, { "epoch": 8.49, "grad_norm": 1.6911069906200282e-05, "learning_rate": 3.596989966555184e-06, "loss": 0.0038, "step": 84900 }, { "epoch": 8.5, "grad_norm": 2.0758106984430924e-05, "learning_rate": 3.5953177257525087e-06, "loss": 0.0005, "step": 85000 }, { "epoch": 8.5, "eval_accuracy": 0.9857, "eval_f1": 0.9857, "eval_loss": 0.13761389255523682, "eval_runtime": 135.2379, "eval_samples_per_second": 295.775, "eval_steps_per_second": 295.775, "step": 85000 }, { "epoch": 8.51, "grad_norm": 6.27836343483068e-05, "learning_rate": 3.593645484949833e-06, "loss": 0.0204, "step": 85100 }, { "epoch": 8.52, "grad_norm": 1.9296541722724214e-05, "learning_rate": 3.5919732441471576e-06, "loss": 0.0039, "step": 85200 }, { "epoch": 8.53, "grad_norm": 0.0005484924768097699, "learning_rate": 3.590301003344482e-06, "loss": 0.0083, "step": 85300 }, { "epoch": 8.54, "grad_norm": 1.317109854426235e-05, "learning_rate": 3.5886287625418065e-06, "loss": 0.0, "step": 85400 }, { "epoch": 8.55, "grad_norm": 1.4201648809830658e-05, "learning_rate": 3.5869565217391305e-06, "loss": 0.0061, "step": 85500 }, { "epoch": 8.56, "grad_norm": 4.045464811497368e-05, "learning_rate": 3.585284280936455e-06, "loss": 0.0, "step": 85600 }, { "epoch": 8.57, "grad_norm": 4.9041085731005296e-05, "learning_rate": 3.58361204013378e-06, "loss": 0.0001, "step": 85700 }, { "epoch": 8.58, "grad_norm": 0.0012458403361961246, "learning_rate": 3.581939799331104e-06, "loss": 0.0035, "step": 85800 }, { "epoch": 8.59, "grad_norm": 1.6935151506913826e-05, "learning_rate": 3.5802675585284284e-06, "loss": 0.0003, "step": 85900 }, { "epoch": 8.6, "grad_norm": 2.6776717277243733e-05, "learning_rate": 3.578595317725753e-06, "loss": 0.0002, "step": 86000 }, { "epoch": 8.61, "grad_norm": 2.0100162146263756e-05, "learning_rate": 3.5769230769230773e-06, "loss": 0.0102, "step": 86100 }, { "epoch": 8.62, "grad_norm": 1.1070689652115107e-05, "learning_rate": 3.5752508361204013e-06, "loss": 0.0, "step": 86200 }, { "epoch": 8.63, "grad_norm": 1.861604869191069e-05, "learning_rate": 3.573578595317726e-06, "loss": 0.0044, "step": 86300 }, { "epoch": 8.64, "grad_norm": 8.287372475024313e-05, "learning_rate": 3.5719063545150507e-06, "loss": 0.006, "step": 86400 }, { "epoch": 8.65, "grad_norm": 1.6306277757394128e-05, "learning_rate": 3.5702341137123747e-06, "loss": 0.0166, "step": 86500 }, { "epoch": 8.66, "grad_norm": 0.00019994894682895392, "learning_rate": 3.568561872909699e-06, "loss": 0.0032, "step": 86600 }, { "epoch": 8.67, "grad_norm": 1.5932404494378716e-05, "learning_rate": 3.5668896321070236e-06, "loss": 0.0048, "step": 86700 }, { "epoch": 8.68, "grad_norm": 4.9370060878572986e-05, "learning_rate": 3.565217391304348e-06, "loss": 0.0027, "step": 86800 }, { "epoch": 8.69, "grad_norm": 1.9548153431969695e-05, "learning_rate": 3.5635451505016726e-06, "loss": 0.0, "step": 86900 }, { "epoch": 8.7, "grad_norm": 1.9069799236604013e-05, "learning_rate": 3.5618729096989966e-06, "loss": 0.008, "step": 87000 }, { "epoch": 8.71, "grad_norm": 0.00030942470766603947, "learning_rate": 3.5602006688963215e-06, "loss": 0.014, "step": 87100 }, { "epoch": 8.72, "grad_norm": 0.00028676740475930274, "learning_rate": 3.5585284280936455e-06, "loss": 0.0001, "step": 87200 }, { "epoch": 8.73, "grad_norm": 0.00011423743126215413, "learning_rate": 3.55685618729097e-06, "loss": 0.0036, "step": 87300 }, { "epoch": 8.74, "grad_norm": 2.1552994439844042e-05, "learning_rate": 3.555183946488295e-06, "loss": 0.0, "step": 87400 }, { "epoch": 8.75, "grad_norm": 1.99559326574672e-05, "learning_rate": 3.553511705685619e-06, "loss": 0.0018, "step": 87500 }, { "epoch": 8.76, "grad_norm": 1.4471507711277809e-05, "learning_rate": 3.5518394648829434e-06, "loss": 0.0058, "step": 87600 }, { "epoch": 8.77, "grad_norm": 0.00047445972450077534, "learning_rate": 3.5501672240802674e-06, "loss": 0.0, "step": 87700 }, { "epoch": 8.78, "grad_norm": 7.757473940728232e-05, "learning_rate": 3.5484949832775923e-06, "loss": 0.0, "step": 87800 }, { "epoch": 8.79, "grad_norm": 2.4758850486250594e-05, "learning_rate": 3.5468227424749167e-06, "loss": 0.0, "step": 87900 }, { "epoch": 8.8, "grad_norm": 3.1655439670430496e-05, "learning_rate": 3.5451505016722408e-06, "loss": 0.0, "step": 88000 }, { "epoch": 8.81, "grad_norm": 0.0323539599776268, "learning_rate": 3.5434782608695657e-06, "loss": 0.0066, "step": 88100 }, { "epoch": 8.82, "grad_norm": 2.997820411110297e-05, "learning_rate": 3.5418060200668897e-06, "loss": 0.0019, "step": 88200 }, { "epoch": 8.83, "grad_norm": 2.695979674172122e-05, "learning_rate": 3.540133779264214e-06, "loss": 0.0037, "step": 88300 }, { "epoch": 8.84, "grad_norm": 0.00042679201578721404, "learning_rate": 3.538461538461539e-06, "loss": 0.0101, "step": 88400 }, { "epoch": 8.85, "grad_norm": 0.0005938719259575009, "learning_rate": 3.536789297658863e-06, "loss": 0.0, "step": 88500 }, { "epoch": 8.86, "grad_norm": 1.996893115574494e-05, "learning_rate": 3.5351170568561875e-06, "loss": 0.0093, "step": 88600 }, { "epoch": 8.87, "grad_norm": 1.4617453416576609e-05, "learning_rate": 3.5334448160535116e-06, "loss": 0.0053, "step": 88700 }, { "epoch": 8.88, "grad_norm": 2.820068402797915e-05, "learning_rate": 3.5317725752508365e-06, "loss": 0.0102, "step": 88800 }, { "epoch": 8.89, "grad_norm": 2.9803282814100385e-05, "learning_rate": 3.530100334448161e-06, "loss": 0.0025, "step": 88900 }, { "epoch": 8.9, "grad_norm": 1.5636564057786018e-05, "learning_rate": 3.528428093645485e-06, "loss": 0.006, "step": 89000 }, { "epoch": 8.91, "grad_norm": 1.9173994587617926e-05, "learning_rate": 3.52675585284281e-06, "loss": 0.0007, "step": 89100 }, { "epoch": 8.92, "grad_norm": 8.140524732880294e-05, "learning_rate": 3.525083612040134e-06, "loss": 0.0103, "step": 89200 }, { "epoch": 8.93, "grad_norm": 2.191733437939547e-05, "learning_rate": 3.5234113712374583e-06, "loss": 0.0, "step": 89300 }, { "epoch": 8.94, "grad_norm": 0.00012123118358431384, "learning_rate": 3.5217391304347832e-06, "loss": 0.0046, "step": 89400 }, { "epoch": 8.95, "grad_norm": 0.00010186954023083672, "learning_rate": 3.5200668896321073e-06, "loss": 0.0054, "step": 89500 }, { "epoch": 8.96, "grad_norm": 0.010097470134496689, "learning_rate": 3.5183946488294317e-06, "loss": 0.0, "step": 89600 }, { "epoch": 8.97, "grad_norm": 0.02606087736785412, "learning_rate": 3.5167224080267558e-06, "loss": 0.0127, "step": 89700 }, { "epoch": 8.98, "grad_norm": 0.005077203270047903, "learning_rate": 3.5150501672240807e-06, "loss": 0.0029, "step": 89800 }, { "epoch": 8.99, "grad_norm": 0.0013905907981097698, "learning_rate": 3.513377926421405e-06, "loss": 0.0073, "step": 89900 }, { "epoch": 9.0, "grad_norm": 6.247458077268675e-05, "learning_rate": 3.511705685618729e-06, "loss": 0.0103, "step": 90000 }, { "epoch": 9.0, "eval_accuracy": 0.986125, "eval_f1": 0.986125, "eval_loss": 0.11607175320386887, "eval_runtime": 138.0617, "eval_samples_per_second": 289.726, "eval_steps_per_second": 289.726, "step": 90000 }, { "epoch": 9.01, "grad_norm": 0.00237733474932611, "learning_rate": 3.510033444816054e-06, "loss": 0.003, "step": 90100 }, { "epoch": 9.02, "grad_norm": 2.1511707018362358e-05, "learning_rate": 3.508361204013378e-06, "loss": 0.0, "step": 90200 }, { "epoch": 9.03, "grad_norm": 1.744981091178488e-05, "learning_rate": 3.5066889632107025e-06, "loss": 0.0, "step": 90300 }, { "epoch": 9.04, "grad_norm": 3.586035018088296e-05, "learning_rate": 3.5050167224080274e-06, "loss": 0.0049, "step": 90400 }, { "epoch": 9.05, "grad_norm": 0.003070132341235876, "learning_rate": 3.5033444816053515e-06, "loss": 0.0073, "step": 90500 }, { "epoch": 9.06, "grad_norm": 0.0012648507254198194, "learning_rate": 3.501672240802676e-06, "loss": 0.0024, "step": 90600 }, { "epoch": 9.07, "grad_norm": 2.4553286493755877e-05, "learning_rate": 3.5e-06, "loss": 0.0027, "step": 90700 }, { "epoch": 9.08, "grad_norm": 0.03265082463622093, "learning_rate": 3.498327759197325e-06, "loss": 0.0055, "step": 90800 }, { "epoch": 9.09, "grad_norm": 2.972443326143548e-05, "learning_rate": 3.4966555183946493e-06, "loss": 0.0, "step": 90900 }, { "epoch": 9.1, "grad_norm": 8.496625378029421e-05, "learning_rate": 3.4949832775919733e-06, "loss": 0.0038, "step": 91000 }, { "epoch": 9.11, "grad_norm": 3.153005309286527e-05, "learning_rate": 3.4933110367892982e-06, "loss": 0.0042, "step": 91100 }, { "epoch": 9.12, "grad_norm": 4.433723370311782e-05, "learning_rate": 3.4916387959866222e-06, "loss": 0.0, "step": 91200 }, { "epoch": 9.13, "grad_norm": 1.847981366154272e-05, "learning_rate": 3.4899665551839467e-06, "loss": 0.0015, "step": 91300 }, { "epoch": 9.14, "grad_norm": 1.2556378351291642e-05, "learning_rate": 3.4882943143812716e-06, "loss": 0.0001, "step": 91400 }, { "epoch": 9.15, "grad_norm": 29.66756248474121, "learning_rate": 3.4866220735785956e-06, "loss": 0.0089, "step": 91500 }, { "epoch": 9.16, "grad_norm": 0.005458858795464039, "learning_rate": 3.48494983277592e-06, "loss": 0.0085, "step": 91600 }, { "epoch": 9.17, "grad_norm": 3.206008477718569e-05, "learning_rate": 3.483277591973244e-06, "loss": 0.0072, "step": 91700 }, { "epoch": 9.18, "grad_norm": 0.0695156380534172, "learning_rate": 3.481605351170569e-06, "loss": 0.0003, "step": 91800 }, { "epoch": 9.19, "grad_norm": 2.1034235032857396e-05, "learning_rate": 3.4799331103678935e-06, "loss": 0.0024, "step": 91900 }, { "epoch": 9.2, "grad_norm": 0.0007182686822488904, "learning_rate": 3.4782608695652175e-06, "loss": 0.0, "step": 92000 }, { "epoch": 9.21, "grad_norm": 5.026232611271553e-05, "learning_rate": 3.4765886287625424e-06, "loss": 0.01, "step": 92100 }, { "epoch": 9.22, "grad_norm": 2.3394995878334157e-05, "learning_rate": 3.4749163879598664e-06, "loss": 0.0142, "step": 92200 }, { "epoch": 9.23, "grad_norm": 8.789805724518374e-05, "learning_rate": 3.473244147157191e-06, "loss": 0.0001, "step": 92300 }, { "epoch": 9.24, "grad_norm": 2.1054107492091134e-05, "learning_rate": 3.4715719063545158e-06, "loss": 0.0011, "step": 92400 }, { "epoch": 9.25, "grad_norm": 0.00026566089945845306, "learning_rate": 3.46989966555184e-06, "loss": 0.0049, "step": 92500 }, { "epoch": 9.26, "grad_norm": 4.639206963474862e-05, "learning_rate": 3.4682274247491643e-06, "loss": 0.0115, "step": 92600 }, { "epoch": 9.27, "grad_norm": 3.660786387627013e-05, "learning_rate": 3.4665551839464883e-06, "loss": 0.0002, "step": 92700 }, { "epoch": 9.28, "grad_norm": 3.224415559088811e-05, "learning_rate": 3.464882943143813e-06, "loss": 0.0148, "step": 92800 }, { "epoch": 9.29, "grad_norm": 0.002478808630257845, "learning_rate": 3.4632107023411377e-06, "loss": 0.0041, "step": 92900 }, { "epoch": 9.3, "grad_norm": 5.7758701586863026e-05, "learning_rate": 3.4615384615384617e-06, "loss": 0.0039, "step": 93000 }, { "epoch": 9.31, "grad_norm": 0.00024893361842259765, "learning_rate": 3.4598662207357866e-06, "loss": 0.0032, "step": 93100 }, { "epoch": 9.32, "grad_norm": 6.84029291733168e-05, "learning_rate": 3.4581939799331106e-06, "loss": 0.0062, "step": 93200 }, { "epoch": 9.33, "grad_norm": 0.0036423206329345703, "learning_rate": 3.456521739130435e-06, "loss": 0.0008, "step": 93300 }, { "epoch": 9.34, "grad_norm": 0.029986459761857986, "learning_rate": 3.4548494983277595e-06, "loss": 0.0, "step": 93400 }, { "epoch": 9.35, "grad_norm": 0.00015107858052942902, "learning_rate": 3.453177257525084e-06, "loss": 0.0, "step": 93500 }, { "epoch": 9.36, "grad_norm": 5.476970545714721e-05, "learning_rate": 3.4515050167224085e-06, "loss": 0.0018, "step": 93600 }, { "epoch": 9.37, "grad_norm": 0.00011632416135398671, "learning_rate": 3.4498327759197325e-06, "loss": 0.0088, "step": 93700 }, { "epoch": 9.38, "grad_norm": 0.0003741678374353796, "learning_rate": 3.4481605351170574e-06, "loss": 0.0045, "step": 93800 }, { "epoch": 9.39, "grad_norm": 0.0009761012624949217, "learning_rate": 3.446488294314382e-06, "loss": 0.0033, "step": 93900 }, { "epoch": 9.4, "grad_norm": 3.911781823262572e-05, "learning_rate": 3.444816053511706e-06, "loss": 0.0019, "step": 94000 }, { "epoch": 9.41, "grad_norm": 1.7435120753361844e-05, "learning_rate": 3.4431438127090303e-06, "loss": 0.0048, "step": 94100 }, { "epoch": 9.42, "grad_norm": 3.1139043130679056e-05, "learning_rate": 3.441471571906355e-06, "loss": 0.0, "step": 94200 }, { "epoch": 9.43, "grad_norm": 0.0002475950459484011, "learning_rate": 3.4397993311036793e-06, "loss": 0.0072, "step": 94300 }, { "epoch": 9.44, "grad_norm": 1.8925727999885567e-05, "learning_rate": 3.4381270903010037e-06, "loss": 0.0018, "step": 94400 }, { "epoch": 9.45, "grad_norm": 1.3380113159655593e-05, "learning_rate": 3.436454849498328e-06, "loss": 0.0, "step": 94500 }, { "epoch": 9.46, "grad_norm": 4.596728831529617e-05, "learning_rate": 3.4347826086956526e-06, "loss": 0.0027, "step": 94600 }, { "epoch": 9.47, "grad_norm": 6.345294241327792e-05, "learning_rate": 3.4331103678929767e-06, "loss": 0.0032, "step": 94700 }, { "epoch": 9.48, "grad_norm": 1.4074191312829498e-05, "learning_rate": 3.431438127090301e-06, "loss": 0.0038, "step": 94800 }, { "epoch": 9.49, "grad_norm": 0.0012907675700262189, "learning_rate": 3.429765886287626e-06, "loss": 0.0, "step": 94900 }, { "epoch": 9.5, "grad_norm": 0.010470391251146793, "learning_rate": 3.42809364548495e-06, "loss": 0.0152, "step": 95000 }, { "epoch": 9.5, "eval_accuracy": 0.9842, "eval_f1": 0.9842, "eval_loss": 0.13753095269203186, "eval_runtime": 137.4145, "eval_samples_per_second": 291.09, "eval_steps_per_second": 291.09, "step": 95000 }, { "epoch": 9.51, "grad_norm": 0.004034177400171757, "learning_rate": 3.4264214046822745e-06, "loss": 0.0143, "step": 95100 }, { "epoch": 9.52, "grad_norm": 0.002143730642274022, "learning_rate": 3.424749163879599e-06, "loss": 0.0018, "step": 95200 }, { "epoch": 9.53, "grad_norm": 0.0022789237555116415, "learning_rate": 3.4230769230769234e-06, "loss": 0.0, "step": 95300 }, { "epoch": 9.54, "grad_norm": 9.519604645902291e-05, "learning_rate": 3.421404682274248e-06, "loss": 0.0159, "step": 95400 }, { "epoch": 9.55, "grad_norm": 0.0038595341611653566, "learning_rate": 3.419732441471572e-06, "loss": 0.004, "step": 95500 }, { "epoch": 9.56, "grad_norm": 2.268450589326676e-05, "learning_rate": 3.418060200668897e-06, "loss": 0.0006, "step": 95600 }, { "epoch": 9.57, "grad_norm": 3.149811891489662e-05, "learning_rate": 3.416387959866221e-06, "loss": 0.0064, "step": 95700 }, { "epoch": 9.58, "grad_norm": 2.6584746592561714e-05, "learning_rate": 3.4147157190635453e-06, "loss": 0.0101, "step": 95800 }, { "epoch": 9.59, "grad_norm": 0.020238105207681656, "learning_rate": 3.4130434782608698e-06, "loss": 0.0022, "step": 95900 }, { "epoch": 9.6, "grad_norm": 0.0001373113045701757, "learning_rate": 3.4113712374581942e-06, "loss": 0.0, "step": 96000 }, { "epoch": 9.61, "grad_norm": 0.0002782682713586837, "learning_rate": 3.4096989966555187e-06, "loss": 0.0, "step": 96100 }, { "epoch": 9.62, "grad_norm": 0.00020216793927829713, "learning_rate": 3.4080267558528427e-06, "loss": 0.0025, "step": 96200 }, { "epoch": 9.63, "grad_norm": 0.02557799033820629, "learning_rate": 3.4063545150501676e-06, "loss": 0.0133, "step": 96300 }, { "epoch": 9.64, "grad_norm": 1.481761228205869e-05, "learning_rate": 3.4046822742474917e-06, "loss": 0.0029, "step": 96400 }, { "epoch": 9.65, "grad_norm": 0.007463955320417881, "learning_rate": 3.403010033444816e-06, "loss": 0.0064, "step": 96500 }, { "epoch": 9.66, "grad_norm": 0.00026536182849667966, "learning_rate": 3.401337792642141e-06, "loss": 0.006, "step": 96600 }, { "epoch": 9.67, "grad_norm": 3.175417077727616e-05, "learning_rate": 3.399665551839465e-06, "loss": 0.0082, "step": 96700 }, { "epoch": 9.68, "grad_norm": 2.764562304946594e-05, "learning_rate": 3.3979933110367895e-06, "loss": 0.0001, "step": 96800 }, { "epoch": 9.69, "grad_norm": 2.0898842194583267e-05, "learning_rate": 3.3963210702341135e-06, "loss": 0.0, "step": 96900 }, { "epoch": 9.7, "grad_norm": 0.00020050223974976689, "learning_rate": 3.3946488294314384e-06, "loss": 0.0012, "step": 97000 }, { "epoch": 9.71, "grad_norm": 4.08537634939421e-05, "learning_rate": 3.392976588628763e-06, "loss": 0.0, "step": 97100 }, { "epoch": 9.72, "grad_norm": 1.0601573194435332e-05, "learning_rate": 3.391304347826087e-06, "loss": 0.0, "step": 97200 }, { "epoch": 9.73, "grad_norm": 2.0686477000708692e-05, "learning_rate": 3.389632107023412e-06, "loss": 0.0, "step": 97300 }, { "epoch": 9.74, "grad_norm": 0.034469928592443466, "learning_rate": 3.387959866220736e-06, "loss": 0.0074, "step": 97400 }, { "epoch": 9.75, "grad_norm": 5.762694854638539e-05, "learning_rate": 3.3862876254180603e-06, "loss": 0.013, "step": 97500 }, { "epoch": 9.76, "grad_norm": 8.739017357584089e-05, "learning_rate": 3.384615384615385e-06, "loss": 0.0065, "step": 97600 }, { "epoch": 9.77, "grad_norm": 1.8738399376161397e-05, "learning_rate": 3.3829431438127092e-06, "loss": 0.0025, "step": 97700 }, { "epoch": 9.78, "grad_norm": 1.5836463717278093e-05, "learning_rate": 3.3812709030100337e-06, "loss": 0.0, "step": 97800 }, { "epoch": 9.79, "grad_norm": 33.77321243286133, "learning_rate": 3.3795986622073577e-06, "loss": 0.0102, "step": 97900 }, { "epoch": 9.8, "grad_norm": 2.430802669550758e-05, "learning_rate": 3.3779264214046826e-06, "loss": 0.0, "step": 98000 }, { "epoch": 9.81, "grad_norm": 1.6347927157767117e-05, "learning_rate": 3.376254180602007e-06, "loss": 0.0, "step": 98100 }, { "epoch": 9.82, "grad_norm": 2.01090078917332e-05, "learning_rate": 3.374581939799331e-06, "loss": 0.008, "step": 98200 }, { "epoch": 9.83, "grad_norm": 0.00010819236194947734, "learning_rate": 3.372909698996656e-06, "loss": 0.0, "step": 98300 }, { "epoch": 9.84, "grad_norm": 0.0038400471676141024, "learning_rate": 3.37123745819398e-06, "loss": 0.0, "step": 98400 }, { "epoch": 9.85, "grad_norm": 6.486551865236834e-05, "learning_rate": 3.3695652173913045e-06, "loss": 0.0001, "step": 98500 }, { "epoch": 9.86, "grad_norm": 0.00033912554499693215, "learning_rate": 3.3678929765886294e-06, "loss": 0.0, "step": 98600 }, { "epoch": 9.87, "grad_norm": 2.700369259400759e-05, "learning_rate": 3.3662207357859534e-06, "loss": 0.0026, "step": 98700 }, { "epoch": 9.88, "grad_norm": 1.5537827493972145e-05, "learning_rate": 3.364548494983278e-06, "loss": 0.0074, "step": 98800 }, { "epoch": 9.89, "grad_norm": 2.3932556359795853e-05, "learning_rate": 3.362876254180602e-06, "loss": 0.0037, "step": 98900 }, { "epoch": 9.9, "grad_norm": 1.4799331438553054e-05, "learning_rate": 3.3612040133779268e-06, "loss": 0.0024, "step": 99000 }, { "epoch": 9.91, "grad_norm": 2.9224020181572996e-05, "learning_rate": 3.3595317725752512e-06, "loss": 0.0041, "step": 99100 }, { "epoch": 9.92, "grad_norm": 0.0006482133758254349, "learning_rate": 3.3578595317725753e-06, "loss": 0.003, "step": 99200 }, { "epoch": 9.93, "grad_norm": 2.1120746168890037e-05, "learning_rate": 3.3561872909699e-06, "loss": 0.01, "step": 99300 }, { "epoch": 9.94, "grad_norm": 5.945864904788323e-05, "learning_rate": 3.354515050167224e-06, "loss": 0.0051, "step": 99400 }, { "epoch": 9.95, "grad_norm": 4.6771976485615596e-05, "learning_rate": 3.3528428093645487e-06, "loss": 0.0, "step": 99500 }, { "epoch": 9.96, "grad_norm": 1.595135108800605e-05, "learning_rate": 3.3511705685618735e-06, "loss": 0.0, "step": 99600 }, { "epoch": 9.97, "grad_norm": 46.883026123046875, "learning_rate": 3.3494983277591976e-06, "loss": 0.0033, "step": 99700 }, { "epoch": 9.98, "grad_norm": 0.00022700062254443765, "learning_rate": 3.347826086956522e-06, "loss": 0.0156, "step": 99800 }, { "epoch": 9.99, "grad_norm": 0.000159778879606165, "learning_rate": 3.346153846153846e-06, "loss": 0.0, "step": 99900 }, { "epoch": 10.0, "grad_norm": 8.880961104296148e-05, "learning_rate": 3.344481605351171e-06, "loss": 0.0001, "step": 100000 }, { "epoch": 10.0, "eval_accuracy": 0.98565, "eval_f1": 0.98565, "eval_loss": 0.12817147374153137, "eval_runtime": 136.2325, "eval_samples_per_second": 293.616, "eval_steps_per_second": 293.616, "step": 100000 }, { "epoch": 10.01, "grad_norm": 3.595802991185337e-05, "learning_rate": 3.3428093645484954e-06, "loss": 0.0095, "step": 100100 }, { "epoch": 10.02, "grad_norm": 0.0004151228931732476, "learning_rate": 3.3411371237458195e-06, "loss": 0.0075, "step": 100200 }, { "epoch": 10.03, "grad_norm": 0.010722924955189228, "learning_rate": 3.3394648829431443e-06, "loss": 0.0, "step": 100300 }, { "epoch": 10.04, "grad_norm": 0.016409115865826607, "learning_rate": 3.3377926421404684e-06, "loss": 0.0, "step": 100400 }, { "epoch": 10.05, "grad_norm": 1.7606289475224912e-05, "learning_rate": 3.336120401337793e-06, "loss": 0.0075, "step": 100500 }, { "epoch": 10.06, "grad_norm": 2.802386188704986e-05, "learning_rate": 3.3344481605351177e-06, "loss": 0.0082, "step": 100600 }, { "epoch": 10.07, "grad_norm": 9.355850488645956e-05, "learning_rate": 3.3327759197324418e-06, "loss": 0.0, "step": 100700 }, { "epoch": 10.08, "grad_norm": 2.488598329364322e-05, "learning_rate": 3.3311036789297662e-06, "loss": 0.0, "step": 100800 }, { "epoch": 10.09, "grad_norm": 1.7552529243403114e-05, "learning_rate": 3.3294314381270903e-06, "loss": 0.0057, "step": 100900 }, { "epoch": 10.1, "grad_norm": 4.751099550048821e-05, "learning_rate": 3.327759197324415e-06, "loss": 0.0, "step": 101000 }, { "epoch": 10.11, "grad_norm": 6.215456232894212e-05, "learning_rate": 3.3260869565217396e-06, "loss": 0.0, "step": 101100 }, { "epoch": 10.12, "grad_norm": 0.012414290569722652, "learning_rate": 3.3244147157190636e-06, "loss": 0.0056, "step": 101200 }, { "epoch": 10.13, "grad_norm": 5.210730159888044e-05, "learning_rate": 3.3227424749163885e-06, "loss": 0.0, "step": 101300 }, { "epoch": 10.14, "grad_norm": 6.133528222562745e-05, "learning_rate": 3.3210702341137126e-06, "loss": 0.0028, "step": 101400 }, { "epoch": 10.15, "grad_norm": 0.003408723743632436, "learning_rate": 3.319397993311037e-06, "loss": 0.0002, "step": 101500 }, { "epoch": 10.16, "grad_norm": 3.5033637686865404e-05, "learning_rate": 3.317725752508362e-06, "loss": 0.0003, "step": 101600 }, { "epoch": 10.17, "grad_norm": 4.893932418781333e-05, "learning_rate": 3.316053511705686e-06, "loss": 0.0021, "step": 101700 }, { "epoch": 10.18, "grad_norm": 3.814970114035532e-05, "learning_rate": 3.3143812709030104e-06, "loss": 0.0024, "step": 101800 }, { "epoch": 10.19, "grad_norm": 1.5137900845729746e-05, "learning_rate": 3.3127090301003344e-06, "loss": 0.0017, "step": 101900 }, { "epoch": 10.2, "grad_norm": 2.083205254166387e-05, "learning_rate": 3.3110367892976593e-06, "loss": 0.0104, "step": 102000 }, { "epoch": 10.21, "grad_norm": 0.003052659798413515, "learning_rate": 3.3093645484949838e-06, "loss": 0.0001, "step": 102100 }, { "epoch": 10.22, "grad_norm": 2.759070230240468e-05, "learning_rate": 3.307692307692308e-06, "loss": 0.0047, "step": 102200 }, { "epoch": 10.23, "grad_norm": 2.606885027489625e-05, "learning_rate": 3.3060200668896327e-06, "loss": 0.0002, "step": 102300 }, { "epoch": 10.24, "grad_norm": 1.9891685951733962e-05, "learning_rate": 3.3043478260869567e-06, "loss": 0.0, "step": 102400 }, { "epoch": 10.25, "grad_norm": 0.0007487214170396328, "learning_rate": 3.302675585284281e-06, "loss": 0.0, "step": 102500 }, { "epoch": 10.26, "grad_norm": 1.7976259186980315e-05, "learning_rate": 3.3010033444816057e-06, "loss": 0.0057, "step": 102600 }, { "epoch": 10.27, "grad_norm": 1.4554972949554212e-05, "learning_rate": 3.29933110367893e-06, "loss": 0.0, "step": 102700 }, { "epoch": 10.28, "grad_norm": 4.100278238183819e-05, "learning_rate": 3.2976588628762546e-06, "loss": 0.0, "step": 102800 }, { "epoch": 10.29, "grad_norm": 1.692805381026119e-05, "learning_rate": 3.2959866220735786e-06, "loss": 0.0046, "step": 102900 }, { "epoch": 10.3, "grad_norm": 5.216592035139911e-05, "learning_rate": 3.2943143812709035e-06, "loss": 0.0004, "step": 103000 }, { "epoch": 10.31, "grad_norm": 1.638688081584405e-05, "learning_rate": 3.292642140468228e-06, "loss": 0.005, "step": 103100 }, { "epoch": 10.32, "grad_norm": 1.8229413399240002e-05, "learning_rate": 3.290969899665552e-06, "loss": 0.0034, "step": 103200 }, { "epoch": 10.33, "grad_norm": 2.5418792574782856e-05, "learning_rate": 3.2892976588628765e-06, "loss": 0.0, "step": 103300 }, { "epoch": 10.34, "grad_norm": 74.45674896240234, "learning_rate": 3.287625418060201e-06, "loss": 0.0064, "step": 103400 }, { "epoch": 10.35, "grad_norm": 1.6789086657809094e-05, "learning_rate": 3.2859531772575254e-06, "loss": 0.0046, "step": 103500 }, { "epoch": 10.36, "grad_norm": 4.680682104662992e-05, "learning_rate": 3.28428093645485e-06, "loss": 0.0043, "step": 103600 }, { "epoch": 10.37, "grad_norm": 2.3295109713217244e-05, "learning_rate": 3.2826086956521743e-06, "loss": 0.0049, "step": 103700 }, { "epoch": 10.38, "grad_norm": 4.589736636262387e-05, "learning_rate": 3.2809364548494988e-06, "loss": 0.0, "step": 103800 }, { "epoch": 10.39, "grad_norm": 2.4069267965387553e-05, "learning_rate": 3.279264214046823e-06, "loss": 0.0, "step": 103900 }, { "epoch": 10.4, "grad_norm": 1.4721795196237508e-05, "learning_rate": 3.2775919732441473e-06, "loss": 0.0, "step": 104000 }, { "epoch": 10.41, "grad_norm": 1.2814829460694455e-05, "learning_rate": 3.275919732441472e-06, "loss": 0.0, "step": 104100 }, { "epoch": 10.42, "grad_norm": 1.1469286619103514e-05, "learning_rate": 3.274247491638796e-06, "loss": 0.0066, "step": 104200 }, { "epoch": 10.43, "grad_norm": 2.0519388272077776e-05, "learning_rate": 3.2725752508361206e-06, "loss": 0.0025, "step": 104300 }, { "epoch": 10.44, "grad_norm": 0.004769354593008757, "learning_rate": 3.270903010033445e-06, "loss": 0.0079, "step": 104400 }, { "epoch": 10.45, "grad_norm": 2.9404161978163756e-05, "learning_rate": 3.2692307692307696e-06, "loss": 0.0028, "step": 104500 }, { "epoch": 10.46, "grad_norm": 1.2868718840763904e-05, "learning_rate": 3.267558528428094e-06, "loss": 0.0, "step": 104600 }, { "epoch": 10.47, "grad_norm": 2.208415389759466e-05, "learning_rate": 3.265886287625418e-06, "loss": 0.0033, "step": 104700 }, { "epoch": 10.48, "grad_norm": 1.746260022628121e-05, "learning_rate": 3.264214046822743e-06, "loss": 0.0078, "step": 104800 }, { "epoch": 10.49, "grad_norm": 3.7256126233842224e-05, "learning_rate": 3.262541806020067e-06, "loss": 0.0, "step": 104900 }, { "epoch": 10.5, "grad_norm": 1.3822974324284587e-05, "learning_rate": 3.2608695652173914e-06, "loss": 0.0001, "step": 105000 }, { "epoch": 10.5, "eval_accuracy": 0.986725, "eval_f1": 0.986725, "eval_loss": 0.1294260323047638, "eval_runtime": 135.4142, "eval_samples_per_second": 295.39, "eval_steps_per_second": 295.39, "step": 105000 }, { "epoch": 10.51, "grad_norm": 1.273143516300479e-05, "learning_rate": 3.2591973244147163e-06, "loss": 0.0, "step": 105100 }, { "epoch": 10.52, "grad_norm": 1.8719376384979114e-05, "learning_rate": 3.2575250836120404e-06, "loss": 0.0003, "step": 105200 }, { "epoch": 10.53, "grad_norm": 2.3224081814987585e-05, "learning_rate": 3.255852842809365e-06, "loss": 0.0026, "step": 105300 }, { "epoch": 10.54, "grad_norm": 0.00043168303091078997, "learning_rate": 3.254180602006689e-06, "loss": 0.002, "step": 105400 }, { "epoch": 10.55, "grad_norm": 1.3707303878618404e-05, "learning_rate": 3.2525083612040137e-06, "loss": 0.0034, "step": 105500 }, { "epoch": 10.56, "grad_norm": 9.85491533356253e-06, "learning_rate": 3.250836120401338e-06, "loss": 0.0, "step": 105600 }, { "epoch": 10.57, "grad_norm": 0.00011145220196340233, "learning_rate": 3.2491638795986622e-06, "loss": 0.0059, "step": 105700 }, { "epoch": 10.58, "grad_norm": 2.3184100427897647e-05, "learning_rate": 3.247491638795987e-06, "loss": 0.0065, "step": 105800 }, { "epoch": 10.59, "grad_norm": 1.6724783563404344e-05, "learning_rate": 3.245819397993311e-06, "loss": 0.0037, "step": 105900 }, { "epoch": 10.6, "grad_norm": 1.4956675840949174e-05, "learning_rate": 3.2441471571906356e-06, "loss": 0.0074, "step": 106000 }, { "epoch": 10.61, "grad_norm": 3.993123027612455e-05, "learning_rate": 3.2424749163879597e-06, "loss": 0.0062, "step": 106100 }, { "epoch": 10.62, "grad_norm": 4.653423820855096e-05, "learning_rate": 3.2408026755852845e-06, "loss": 0.0011, "step": 106200 }, { "epoch": 10.63, "grad_norm": 0.0006327496375888586, "learning_rate": 3.239130434782609e-06, "loss": 0.0, "step": 106300 }, { "epoch": 10.64, "grad_norm": 3.2069776352727786e-05, "learning_rate": 3.237458193979933e-06, "loss": 0.0181, "step": 106400 }, { "epoch": 10.65, "grad_norm": 0.0016070237616077065, "learning_rate": 3.235785953177258e-06, "loss": 0.0018, "step": 106500 }, { "epoch": 10.66, "grad_norm": 0.00011340404307702556, "learning_rate": 3.234113712374582e-06, "loss": 0.0, "step": 106600 }, { "epoch": 10.67, "grad_norm": 1.2554538443509955e-05, "learning_rate": 3.2324414715719064e-06, "loss": 0.0, "step": 106700 }, { "epoch": 10.68, "grad_norm": 0.0037620030343532562, "learning_rate": 3.2307692307692313e-06, "loss": 0.0, "step": 106800 }, { "epoch": 10.69, "grad_norm": 1.1234938938287087e-05, "learning_rate": 3.2290969899665553e-06, "loss": 0.0, "step": 106900 }, { "epoch": 10.7, "grad_norm": 9.472200872551184e-06, "learning_rate": 3.22742474916388e-06, "loss": 0.0, "step": 107000 }, { "epoch": 10.71, "grad_norm": 1.4786013707634993e-05, "learning_rate": 3.225752508361204e-06, "loss": 0.0001, "step": 107100 }, { "epoch": 10.72, "grad_norm": 0.00032294646371155977, "learning_rate": 3.2240802675585287e-06, "loss": 0.0033, "step": 107200 }, { "epoch": 10.73, "grad_norm": 3.9019418181851506e-05, "learning_rate": 3.222408026755853e-06, "loss": 0.0047, "step": 107300 }, { "epoch": 10.74, "grad_norm": 2.2364813048625365e-05, "learning_rate": 3.2207357859531772e-06, "loss": 0.005, "step": 107400 }, { "epoch": 10.75, "grad_norm": 9.266052984457929e-06, "learning_rate": 3.219063545150502e-06, "loss": 0.0, "step": 107500 }, { "epoch": 10.76, "grad_norm": 2.405466693744529e-05, "learning_rate": 3.217391304347826e-06, "loss": 0.0011, "step": 107600 }, { "epoch": 10.77, "grad_norm": 1.7374895833199844e-05, "learning_rate": 3.2157190635451506e-06, "loss": 0.0, "step": 107700 }, { "epoch": 10.78, "grad_norm": 1.3817982107866555e-05, "learning_rate": 3.2140468227424755e-06, "loss": 0.0109, "step": 107800 }, { "epoch": 10.79, "grad_norm": 1.589250678080134e-05, "learning_rate": 3.2123745819397995e-06, "loss": 0.0, "step": 107900 }, { "epoch": 10.8, "grad_norm": 5.421064997790381e-05, "learning_rate": 3.210702341137124e-06, "loss": 0.0, "step": 108000 }, { "epoch": 10.81, "grad_norm": 0.00010446009400766343, "learning_rate": 3.209030100334448e-06, "loss": 0.0, "step": 108100 }, { "epoch": 10.82, "grad_norm": 1.0952360753435642e-05, "learning_rate": 3.207357859531773e-06, "loss": 0.0019, "step": 108200 }, { "epoch": 10.83, "grad_norm": 0.0003366768651176244, "learning_rate": 3.2056856187290974e-06, "loss": 0.0082, "step": 108300 }, { "epoch": 10.84, "grad_norm": 0.004733449313789606, "learning_rate": 3.2040133779264214e-06, "loss": 0.0, "step": 108400 }, { "epoch": 10.85, "grad_norm": 9.548472007736564e-05, "learning_rate": 3.2023411371237463e-06, "loss": 0.0, "step": 108500 }, { "epoch": 10.86, "grad_norm": 0.00032680193544365466, "learning_rate": 3.2006688963210703e-06, "loss": 0.0032, "step": 108600 }, { "epoch": 10.87, "grad_norm": 2.7095329642179422e-05, "learning_rate": 3.1989966555183948e-06, "loss": 0.0032, "step": 108700 }, { "epoch": 10.88, "grad_norm": 3.111539990641177e-05, "learning_rate": 3.1973244147157197e-06, "loss": 0.0044, "step": 108800 }, { "epoch": 10.89, "grad_norm": 0.0001842112251324579, "learning_rate": 3.1956521739130437e-06, "loss": 0.0, "step": 108900 }, { "epoch": 10.9, "grad_norm": 4.6635905164293945e-05, "learning_rate": 3.193979933110368e-06, "loss": 0.0046, "step": 109000 }, { "epoch": 10.91, "grad_norm": 9.20000093174167e-06, "learning_rate": 3.192307692307692e-06, "loss": 0.0, "step": 109100 }, { "epoch": 10.92, "grad_norm": 1.49307834362844e-05, "learning_rate": 3.190635451505017e-06, "loss": 0.0068, "step": 109200 }, { "epoch": 10.93, "grad_norm": 1.689198506937828e-05, "learning_rate": 3.1889632107023415e-06, "loss": 0.0048, "step": 109300 }, { "epoch": 10.94, "grad_norm": 0.0002182431344408542, "learning_rate": 3.1872909698996656e-06, "loss": 0.0, "step": 109400 }, { "epoch": 10.95, "grad_norm": 1.8919832655228674e-05, "learning_rate": 3.1856187290969905e-06, "loss": 0.0, "step": 109500 }, { "epoch": 10.96, "grad_norm": 0.0008669137023389339, "learning_rate": 3.1839464882943145e-06, "loss": 0.0029, "step": 109600 }, { "epoch": 10.97, "grad_norm": 7.572957838419825e-05, "learning_rate": 3.182274247491639e-06, "loss": 0.0, "step": 109700 }, { "epoch": 10.98, "grad_norm": 0.17629992961883545, "learning_rate": 3.180602006688964e-06, "loss": 0.0002, "step": 109800 }, { "epoch": 10.99, "grad_norm": 1.813526796468068e-05, "learning_rate": 3.178929765886288e-06, "loss": 0.0052, "step": 109900 }, { "epoch": 11.0, "grad_norm": 0.00025834888219833374, "learning_rate": 3.1772575250836123e-06, "loss": 0.0017, "step": 110000 }, { "epoch": 11.0, "eval_accuracy": 0.98675, "eval_f1": 0.98675, "eval_loss": 0.13637404143810272, "eval_runtime": 133.3484, "eval_samples_per_second": 299.966, "eval_steps_per_second": 299.966, "step": 110000 }, { "epoch": 11.01, "grad_norm": 2.8174052204121836e-05, "learning_rate": 3.1755852842809364e-06, "loss": 0.0036, "step": 110100 }, { "epoch": 11.02, "grad_norm": 6.247372948564589e-05, "learning_rate": 3.1739130434782613e-06, "loss": 0.0021, "step": 110200 }, { "epoch": 11.03, "grad_norm": 1.2338041415205225e-05, "learning_rate": 3.1722408026755857e-06, "loss": 0.0, "step": 110300 }, { "epoch": 11.04, "grad_norm": 3.7150435673538595e-05, "learning_rate": 3.1705685618729098e-06, "loss": 0.0053, "step": 110400 }, { "epoch": 11.05, "grad_norm": 0.01229814998805523, "learning_rate": 3.1688963210702347e-06, "loss": 0.0, "step": 110500 }, { "epoch": 11.06, "grad_norm": 7.667613317607902e-06, "learning_rate": 3.1672240802675587e-06, "loss": 0.0041, "step": 110600 }, { "epoch": 11.07, "grad_norm": 1.0412702977191657e-05, "learning_rate": 3.165551839464883e-06, "loss": 0.0, "step": 110700 }, { "epoch": 11.08, "grad_norm": 9.122279152506962e-06, "learning_rate": 3.163879598662208e-06, "loss": 0.0, "step": 110800 }, { "epoch": 11.09, "grad_norm": 2.4620214389869943e-05, "learning_rate": 3.162207357859532e-06, "loss": 0.0002, "step": 110900 }, { "epoch": 11.1, "grad_norm": 0.002043084939941764, "learning_rate": 3.1605351170568565e-06, "loss": 0.0028, "step": 111000 }, { "epoch": 11.11, "grad_norm": 7.240785180329112e-06, "learning_rate": 3.1588628762541806e-06, "loss": 0.0, "step": 111100 }, { "epoch": 11.12, "grad_norm": 1.0734827810665593e-05, "learning_rate": 3.1571906354515055e-06, "loss": 0.0, "step": 111200 }, { "epoch": 11.13, "grad_norm": 8.010079909581691e-05, "learning_rate": 3.15551839464883e-06, "loss": 0.0064, "step": 111300 }, { "epoch": 11.14, "grad_norm": 7.866760824981611e-06, "learning_rate": 3.153846153846154e-06, "loss": 0.0022, "step": 111400 }, { "epoch": 11.15, "grad_norm": 9.745773240865674e-06, "learning_rate": 3.152173913043479e-06, "loss": 0.0, "step": 111500 }, { "epoch": 11.16, "grad_norm": 2.4606217266409658e-05, "learning_rate": 3.150501672240803e-06, "loss": 0.0121, "step": 111600 }, { "epoch": 11.17, "grad_norm": 0.0005854077753610909, "learning_rate": 3.1488294314381273e-06, "loss": 0.0036, "step": 111700 }, { "epoch": 11.18, "grad_norm": 0.00012307950237300247, "learning_rate": 3.147157190635452e-06, "loss": 0.0059, "step": 111800 }, { "epoch": 11.19, "grad_norm": 0.00010187952284468338, "learning_rate": 3.1454849498327763e-06, "loss": 0.0, "step": 111900 }, { "epoch": 11.2, "grad_norm": 2.967399086628575e-05, "learning_rate": 3.1438127090301007e-06, "loss": 0.0, "step": 112000 }, { "epoch": 11.21, "grad_norm": 8.619118307251483e-05, "learning_rate": 3.1421404682274247e-06, "loss": 0.0, "step": 112100 }, { "epoch": 11.22, "grad_norm": 3.520284371916205e-05, "learning_rate": 3.1404682274247496e-06, "loss": 0.0002, "step": 112200 }, { "epoch": 11.23, "grad_norm": 6.373329961206764e-05, "learning_rate": 3.138795986622074e-06, "loss": 0.0098, "step": 112300 }, { "epoch": 11.24, "grad_norm": 0.0014678030274808407, "learning_rate": 3.137123745819398e-06, "loss": 0.0078, "step": 112400 }, { "epoch": 11.25, "grad_norm": 0.00016898875765036792, "learning_rate": 3.1354515050167226e-06, "loss": 0.0127, "step": 112500 }, { "epoch": 11.26, "grad_norm": 1.7103437130572274e-05, "learning_rate": 3.133779264214047e-06, "loss": 0.0, "step": 112600 }, { "epoch": 11.27, "grad_norm": 3.9083075535018e-05, "learning_rate": 3.1321070234113715e-06, "loss": 0.0069, "step": 112700 }, { "epoch": 11.28, "grad_norm": 0.11483018845319748, "learning_rate": 3.130434782608696e-06, "loss": 0.0003, "step": 112800 }, { "epoch": 11.29, "grad_norm": 0.09915011376142502, "learning_rate": 3.1287625418060204e-06, "loss": 0.0035, "step": 112900 }, { "epoch": 11.3, "grad_norm": 0.0055434671230614185, "learning_rate": 3.127090301003345e-06, "loss": 0.005, "step": 113000 }, { "epoch": 11.31, "grad_norm": 2.727087849052623e-05, "learning_rate": 3.125418060200669e-06, "loss": 0.0036, "step": 113100 }, { "epoch": 11.32, "grad_norm": 4.718463242170401e-05, "learning_rate": 3.1237458193979934e-06, "loss": 0.0, "step": 113200 }, { "epoch": 11.33, "grad_norm": 3.375914820935577e-05, "learning_rate": 3.1220735785953183e-06, "loss": 0.0031, "step": 113300 }, { "epoch": 11.34, "grad_norm": 1.1921174518647604e-05, "learning_rate": 3.1204013377926423e-06, "loss": 0.0, "step": 113400 }, { "epoch": 11.35, "grad_norm": 0.00011794619058491662, "learning_rate": 3.1187290969899668e-06, "loss": 0.0, "step": 113500 }, { "epoch": 11.36, "grad_norm": 0.0018074008403345942, "learning_rate": 3.1170568561872912e-06, "loss": 0.0038, "step": 113600 }, { "epoch": 11.37, "grad_norm": 1.8831220586434938e-05, "learning_rate": 3.1153846153846157e-06, "loss": 0.0031, "step": 113700 }, { "epoch": 11.38, "grad_norm": 0.00034093132126145065, "learning_rate": 3.11371237458194e-06, "loss": 0.0075, "step": 113800 }, { "epoch": 11.39, "grad_norm": 0.0013741998700425029, "learning_rate": 3.112040133779264e-06, "loss": 0.0067, "step": 113900 }, { "epoch": 11.4, "grad_norm": 1.7773139916243963e-05, "learning_rate": 3.110367892976589e-06, "loss": 0.0066, "step": 114000 }, { "epoch": 11.41, "grad_norm": 0.000126648650621064, "learning_rate": 3.108695652173913e-06, "loss": 0.0066, "step": 114100 }, { "epoch": 11.42, "grad_norm": 0.0001225349842570722, "learning_rate": 3.1070234113712376e-06, "loss": 0.0035, "step": 114200 }, { "epoch": 11.43, "grad_norm": 5.7315519370604306e-05, "learning_rate": 3.1053511705685625e-06, "loss": 0.0034, "step": 114300 }, { "epoch": 11.44, "grad_norm": 3.917821231880225e-05, "learning_rate": 3.1036789297658865e-06, "loss": 0.0043, "step": 114400 }, { "epoch": 11.45, "grad_norm": 0.0019230084726586938, "learning_rate": 3.102006688963211e-06, "loss": 0.0016, "step": 114500 }, { "epoch": 11.46, "grad_norm": 2.3124792278395034e-05, "learning_rate": 3.100334448160535e-06, "loss": 0.0, "step": 114600 }, { "epoch": 11.47, "grad_norm": 3.644728712970391e-05, "learning_rate": 3.09866220735786e-06, "loss": 0.0, "step": 114700 }, { "epoch": 11.48, "grad_norm": 0.04555916041135788, "learning_rate": 3.0969899665551843e-06, "loss": 0.0026, "step": 114800 }, { "epoch": 11.49, "grad_norm": 3.428834315855056e-05, "learning_rate": 3.0953177257525084e-06, "loss": 0.0017, "step": 114900 }, { "epoch": 11.5, "grad_norm": 1.4589694728783797e-05, "learning_rate": 3.0936454849498333e-06, "loss": 0.0033, "step": 115000 }, { "epoch": 11.5, "eval_accuracy": 0.987075, "eval_f1": 0.987075, "eval_loss": 0.12469487637281418, "eval_runtime": 134.1209, "eval_samples_per_second": 298.238, "eval_steps_per_second": 298.238, "step": 115000 }, { "epoch": 11.51, "grad_norm": 1.1125358469143976e-05, "learning_rate": 3.0919732441471573e-06, "loss": 0.0, "step": 115100 }, { "epoch": 11.52, "grad_norm": 2.2134607206680812e-05, "learning_rate": 3.0903010033444818e-06, "loss": 0.0063, "step": 115200 }, { "epoch": 11.53, "grad_norm": 1.5470241123693995e-05, "learning_rate": 3.0886287625418066e-06, "loss": 0.0043, "step": 115300 }, { "epoch": 11.54, "grad_norm": 2.767572550510522e-05, "learning_rate": 3.0869565217391307e-06, "loss": 0.0046, "step": 115400 }, { "epoch": 11.55, "grad_norm": 4.574981358018704e-05, "learning_rate": 3.085284280936455e-06, "loss": 0.0018, "step": 115500 }, { "epoch": 11.56, "grad_norm": 0.00040606813854537904, "learning_rate": 3.083612040133779e-06, "loss": 0.009, "step": 115600 }, { "epoch": 11.57, "grad_norm": 8.138959674397483e-05, "learning_rate": 3.081939799331104e-06, "loss": 0.0, "step": 115700 }, { "epoch": 11.58, "grad_norm": 0.0002813829341903329, "learning_rate": 3.080267558528428e-06, "loss": 0.007, "step": 115800 }, { "epoch": 11.59, "grad_norm": 0.00013545318506658077, "learning_rate": 3.0785953177257526e-06, "loss": 0.0056, "step": 115900 }, { "epoch": 11.6, "grad_norm": 8.718832395970821e-05, "learning_rate": 3.0769230769230774e-06, "loss": 0.0074, "step": 116000 }, { "epoch": 11.61, "grad_norm": 0.049765247851610184, "learning_rate": 3.0752508361204015e-06, "loss": 0.0013, "step": 116100 }, { "epoch": 11.62, "grad_norm": 0.12891064584255219, "learning_rate": 3.073578595317726e-06, "loss": 0.0038, "step": 116200 }, { "epoch": 11.63, "grad_norm": 0.010939703322947025, "learning_rate": 3.07190635451505e-06, "loss": 0.0016, "step": 116300 }, { "epoch": 11.64, "grad_norm": 0.0008908117306418717, "learning_rate": 3.070234113712375e-06, "loss": 0.0042, "step": 116400 }, { "epoch": 11.65, "grad_norm": 4.410389010445215e-05, "learning_rate": 3.0685618729096993e-06, "loss": 0.0019, "step": 116500 }, { "epoch": 11.66, "grad_norm": 3.8732574466848746e-05, "learning_rate": 3.0668896321070234e-06, "loss": 0.0022, "step": 116600 }, { "epoch": 11.67, "grad_norm": 4.904616071144119e-05, "learning_rate": 3.0652173913043482e-06, "loss": 0.0032, "step": 116700 }, { "epoch": 11.68, "grad_norm": 5.659882663167082e-05, "learning_rate": 3.0635451505016723e-06, "loss": 0.0079, "step": 116800 }, { "epoch": 11.69, "grad_norm": 4.263202572474256e-05, "learning_rate": 3.0618729096989967e-06, "loss": 0.0058, "step": 116900 }, { "epoch": 11.7, "grad_norm": 2.6740566681837663e-05, "learning_rate": 3.0602006688963216e-06, "loss": 0.0, "step": 117000 }, { "epoch": 11.71, "grad_norm": 0.0026203403249382973, "learning_rate": 3.0585284280936457e-06, "loss": 0.007, "step": 117100 }, { "epoch": 11.72, "grad_norm": 4.1611296182964e-05, "learning_rate": 3.05685618729097e-06, "loss": 0.0, "step": 117200 }, { "epoch": 11.73, "grad_norm": 0.0006726513965986669, "learning_rate": 3.055183946488294e-06, "loss": 0.0, "step": 117300 }, { "epoch": 11.74, "grad_norm": 0.0001233662769664079, "learning_rate": 3.053511705685619e-06, "loss": 0.0, "step": 117400 }, { "epoch": 11.75, "grad_norm": 1.4310979167930782e-05, "learning_rate": 3.0518394648829435e-06, "loss": 0.0, "step": 117500 }, { "epoch": 11.76, "grad_norm": 7.153803016990423e-05, "learning_rate": 3.0501672240802675e-06, "loss": 0.0, "step": 117600 }, { "epoch": 11.77, "grad_norm": 2.3614837118657306e-05, "learning_rate": 3.0484949832775924e-06, "loss": 0.0027, "step": 117700 }, { "epoch": 11.78, "grad_norm": 0.00029216796974651515, "learning_rate": 3.0468227424749165e-06, "loss": 0.0041, "step": 117800 }, { "epoch": 11.79, "grad_norm": 2.791116821754258e-05, "learning_rate": 3.045150501672241e-06, "loss": 0.0, "step": 117900 }, { "epoch": 11.8, "grad_norm": 4.38535789726302e-05, "learning_rate": 3.043478260869566e-06, "loss": 0.0037, "step": 118000 }, { "epoch": 11.81, "grad_norm": 0.00027548184152692556, "learning_rate": 3.04180602006689e-06, "loss": 0.0092, "step": 118100 }, { "epoch": 11.82, "grad_norm": 7.698433910263702e-05, "learning_rate": 3.0401337792642143e-06, "loss": 0.0002, "step": 118200 }, { "epoch": 11.83, "grad_norm": 2.1024501620559022e-05, "learning_rate": 3.0384615384615383e-06, "loss": 0.0, "step": 118300 }, { "epoch": 11.84, "grad_norm": 0.0005384383839555085, "learning_rate": 3.0367892976588632e-06, "loss": 0.0, "step": 118400 }, { "epoch": 11.85, "grad_norm": 3.6589870433090255e-05, "learning_rate": 3.0351170568561877e-06, "loss": 0.0, "step": 118500 }, { "epoch": 11.86, "grad_norm": 1.6869857063284144e-05, "learning_rate": 3.0334448160535117e-06, "loss": 0.0, "step": 118600 }, { "epoch": 11.87, "grad_norm": 1.1717855159076862e-05, "learning_rate": 3.0317725752508366e-06, "loss": 0.0, "step": 118700 }, { "epoch": 11.88, "grad_norm": 2.092311297019478e-05, "learning_rate": 3.0301003344481606e-06, "loss": 0.0, "step": 118800 }, { "epoch": 11.89, "grad_norm": 0.00024131260579451919, "learning_rate": 3.028428093645485e-06, "loss": 0.0, "step": 118900 }, { "epoch": 11.9, "grad_norm": 5.391419108491391e-05, "learning_rate": 3.02675585284281e-06, "loss": 0.0038, "step": 119000 }, { "epoch": 11.91, "grad_norm": 0.00020960722758900374, "learning_rate": 3.025083612040134e-06, "loss": 0.0053, "step": 119100 }, { "epoch": 11.92, "grad_norm": 0.00011951918713748455, "learning_rate": 3.0234113712374585e-06, "loss": 0.004, "step": 119200 }, { "epoch": 11.93, "grad_norm": 3.663140887510963e-05, "learning_rate": 3.0217391304347825e-06, "loss": 0.0, "step": 119300 }, { "epoch": 11.94, "grad_norm": 4.4150354369776323e-05, "learning_rate": 3.0200668896321074e-06, "loss": 0.0044, "step": 119400 }, { "epoch": 11.95, "grad_norm": 5.2497161959763616e-05, "learning_rate": 3.018394648829432e-06, "loss": 0.0024, "step": 119500 }, { "epoch": 11.96, "grad_norm": 4.427917883731425e-05, "learning_rate": 3.016722408026756e-06, "loss": 0.0, "step": 119600 }, { "epoch": 11.97, "grad_norm": 2.8848799047409557e-05, "learning_rate": 3.0150501672240808e-06, "loss": 0.0018, "step": 119700 }, { "epoch": 11.98, "grad_norm": 1.0785791346279439e-05, "learning_rate": 3.013377926421405e-06, "loss": 0.0, "step": 119800 }, { "epoch": 11.99, "grad_norm": 33.651451110839844, "learning_rate": 3.0117056856187293e-06, "loss": 0.006, "step": 119900 }, { "epoch": 12.0, "grad_norm": 0.00619711447507143, "learning_rate": 3.010033444816054e-06, "loss": 0.0, "step": 120000 }, { "epoch": 12.0, "eval_accuracy": 0.9865, "eval_f1": 0.9865, "eval_loss": 0.1380569487810135, "eval_runtime": 133.986, "eval_samples_per_second": 298.539, "eval_steps_per_second": 298.539, "step": 120000 }, { "epoch": 12.01, "grad_norm": 1.0006710908783134e-05, "learning_rate": 3.008361204013378e-06, "loss": 0.0, "step": 120100 }, { "epoch": 12.02, "grad_norm": 0.0002895425714086741, "learning_rate": 3.0066889632107027e-06, "loss": 0.0073, "step": 120200 }, { "epoch": 12.03, "grad_norm": 6.075893543311395e-05, "learning_rate": 3.0050167224080267e-06, "loss": 0.0046, "step": 120300 }, { "epoch": 12.04, "grad_norm": 2.0078807210666128e-05, "learning_rate": 3.0033444816053516e-06, "loss": 0.0, "step": 120400 }, { "epoch": 12.05, "grad_norm": 1.8073727915179916e-05, "learning_rate": 3.001672240802676e-06, "loss": 0.0, "step": 120500 }, { "epoch": 12.06, "grad_norm": 2.0246203348506242e-05, "learning_rate": 3e-06, "loss": 0.0024, "step": 120600 }, { "epoch": 12.07, "grad_norm": 0.0022137134801596403, "learning_rate": 2.998327759197325e-06, "loss": 0.0048, "step": 120700 }, { "epoch": 12.08, "grad_norm": 1.632177190913353e-05, "learning_rate": 2.996655518394649e-06, "loss": 0.0051, "step": 120800 }, { "epoch": 12.09, "grad_norm": 0.009438976645469666, "learning_rate": 2.9949832775919735e-06, "loss": 0.0046, "step": 120900 }, { "epoch": 12.1, "grad_norm": 2.1821919290232472e-05, "learning_rate": 2.9933110367892983e-06, "loss": 0.0003, "step": 121000 }, { "epoch": 12.11, "grad_norm": 0.00013410099199973047, "learning_rate": 2.9916387959866224e-06, "loss": 0.0023, "step": 121100 }, { "epoch": 12.12, "grad_norm": 3.4450891689630225e-05, "learning_rate": 2.989966555183947e-06, "loss": 0.0001, "step": 121200 }, { "epoch": 12.13, "grad_norm": 0.010932223871350288, "learning_rate": 2.988294314381271e-06, "loss": 0.0, "step": 121300 }, { "epoch": 12.14, "grad_norm": 4.724752943729982e-05, "learning_rate": 2.9866220735785958e-06, "loss": 0.0, "step": 121400 }, { "epoch": 12.15, "grad_norm": 1.1601387996051926e-05, "learning_rate": 2.9849498327759202e-06, "loss": 0.0, "step": 121500 }, { "epoch": 12.16, "grad_norm": 4.233758227201179e-05, "learning_rate": 2.9832775919732443e-06, "loss": 0.0, "step": 121600 }, { "epoch": 12.17, "grad_norm": 2.8649757950915955e-05, "learning_rate": 2.9816053511705687e-06, "loss": 0.0, "step": 121700 }, { "epoch": 12.18, "grad_norm": 4.887601608061232e-05, "learning_rate": 2.979933110367893e-06, "loss": 0.0111, "step": 121800 }, { "epoch": 12.19, "grad_norm": 0.016880473122000694, "learning_rate": 2.9782608695652176e-06, "loss": 0.0, "step": 121900 }, { "epoch": 12.2, "grad_norm": 1.5798104868736118e-05, "learning_rate": 2.976588628762542e-06, "loss": 0.0018, "step": 122000 }, { "epoch": 12.21, "grad_norm": 2.9448170607793145e-05, "learning_rate": 2.9749163879598666e-06, "loss": 0.0, "step": 122100 }, { "epoch": 12.22, "grad_norm": 2.1456729882629588e-05, "learning_rate": 2.973244147157191e-06, "loss": 0.0, "step": 122200 }, { "epoch": 12.23, "grad_norm": 1.5617501048836857e-05, "learning_rate": 2.971571906354515e-06, "loss": 0.0, "step": 122300 }, { "epoch": 12.24, "grad_norm": 0.00031002156902104616, "learning_rate": 2.9698996655518395e-06, "loss": 0.0133, "step": 122400 }, { "epoch": 12.25, "grad_norm": 1.7442815078538842e-05, "learning_rate": 2.9682274247491644e-06, "loss": 0.0034, "step": 122500 }, { "epoch": 12.26, "grad_norm": 3.2786520023364574e-05, "learning_rate": 2.9665551839464884e-06, "loss": 0.0035, "step": 122600 }, { "epoch": 12.27, "grad_norm": 8.906285074772313e-05, "learning_rate": 2.964882943143813e-06, "loss": 0.0023, "step": 122700 }, { "epoch": 12.28, "grad_norm": 0.0013216121587902308, "learning_rate": 2.9632107023411374e-06, "loss": 0.0031, "step": 122800 }, { "epoch": 12.29, "grad_norm": 2.5754599846550263e-05, "learning_rate": 2.961538461538462e-06, "loss": 0.0068, "step": 122900 }, { "epoch": 12.3, "grad_norm": 0.000346259621437639, "learning_rate": 2.9598662207357863e-06, "loss": 0.0009, "step": 123000 }, { "epoch": 12.31, "grad_norm": 0.0002841234963852912, "learning_rate": 2.9581939799331103e-06, "loss": 0.0091, "step": 123100 }, { "epoch": 12.32, "grad_norm": 0.0008099031983874738, "learning_rate": 2.956521739130435e-06, "loss": 0.0, "step": 123200 }, { "epoch": 12.33, "grad_norm": 2.596194281068165e-05, "learning_rate": 2.9548494983277592e-06, "loss": 0.0, "step": 123300 }, { "epoch": 12.34, "grad_norm": 1.770333619788289e-05, "learning_rate": 2.9531772575250837e-06, "loss": 0.0, "step": 123400 }, { "epoch": 12.35, "grad_norm": 1.7277186998398975e-05, "learning_rate": 2.9515050167224086e-06, "loss": 0.0, "step": 123500 }, { "epoch": 12.36, "grad_norm": 0.0003047818026971072, "learning_rate": 2.9498327759197326e-06, "loss": 0.0006, "step": 123600 }, { "epoch": 12.37, "grad_norm": 4.139426528126933e-05, "learning_rate": 2.948160535117057e-06, "loss": 0.003, "step": 123700 }, { "epoch": 12.38, "grad_norm": 4.857347084907815e-05, "learning_rate": 2.946488294314381e-06, "loss": 0.0, "step": 123800 }, { "epoch": 12.39, "grad_norm": 1.1135462955280673e-05, "learning_rate": 2.944816053511706e-06, "loss": 0.0, "step": 123900 }, { "epoch": 12.4, "grad_norm": 4.5921427954453975e-05, "learning_rate": 2.9431438127090305e-06, "loss": 0.0, "step": 124000 }, { "epoch": 12.41, "grad_norm": 0.0011348113184794784, "learning_rate": 2.9414715719063545e-06, "loss": 0.0, "step": 124100 }, { "epoch": 12.42, "grad_norm": 9.929543011821806e-06, "learning_rate": 2.9397993311036794e-06, "loss": 0.0, "step": 124200 }, { "epoch": 12.43, "grad_norm": 0.0007508114213123918, "learning_rate": 2.9381270903010034e-06, "loss": 0.0023, "step": 124300 }, { "epoch": 12.44, "grad_norm": 0.0012188480468466878, "learning_rate": 2.936454849498328e-06, "loss": 0.0, "step": 124400 }, { "epoch": 12.45, "grad_norm": 1.107308162318077e-05, "learning_rate": 2.9347826086956528e-06, "loss": 0.0, "step": 124500 }, { "epoch": 12.46, "grad_norm": 1.0170252608077135e-05, "learning_rate": 2.933110367892977e-06, "loss": 0.0, "step": 124600 }, { "epoch": 12.47, "grad_norm": 9.492271055933088e-05, "learning_rate": 2.9314381270903013e-06, "loss": 0.0, "step": 124700 }, { "epoch": 12.48, "grad_norm": 0.0013374453410506248, "learning_rate": 2.9297658862876253e-06, "loss": 0.021, "step": 124800 }, { "epoch": 12.49, "grad_norm": 0.000961253943387419, "learning_rate": 2.92809364548495e-06, "loss": 0.0, "step": 124900 }, { "epoch": 12.5, "grad_norm": 0.00014077928790356964, "learning_rate": 2.9264214046822746e-06, "loss": 0.0002, "step": 125000 }, { "epoch": 12.5, "eval_accuracy": 0.98745, "eval_f1": 0.98745, "eval_loss": 0.11790119856595993, "eval_runtime": 133.865, "eval_samples_per_second": 298.809, "eval_steps_per_second": 298.809, "step": 125000 }, { "epoch": 12.51, "grad_norm": 0.00034569227136671543, "learning_rate": 2.9247491638795987e-06, "loss": 0.0033, "step": 125100 }, { "epoch": 12.52, "grad_norm": 2.6383528165752068e-05, "learning_rate": 2.9230769230769236e-06, "loss": 0.0, "step": 125200 }, { "epoch": 12.53, "grad_norm": 0.00013982444943394512, "learning_rate": 2.9214046822742476e-06, "loss": 0.0, "step": 125300 }, { "epoch": 12.54, "grad_norm": 1.3336039955902379e-05, "learning_rate": 2.919732441471572e-06, "loss": 0.0, "step": 125400 }, { "epoch": 12.55, "grad_norm": 0.00027246633544564247, "learning_rate": 2.918060200668897e-06, "loss": 0.0003, "step": 125500 }, { "epoch": 12.56, "grad_norm": 9.627802501199767e-05, "learning_rate": 2.916387959866221e-06, "loss": 0.0, "step": 125600 }, { "epoch": 12.57, "grad_norm": 1.2915442312078085e-05, "learning_rate": 2.9147157190635454e-06, "loss": 0.0025, "step": 125700 }, { "epoch": 12.58, "grad_norm": 1.4243492842069827e-05, "learning_rate": 2.9130434782608695e-06, "loss": 0.0, "step": 125800 }, { "epoch": 12.59, "grad_norm": 0.01041350793093443, "learning_rate": 2.9113712374581944e-06, "loss": 0.0, "step": 125900 }, { "epoch": 12.6, "grad_norm": 1.3331155969353858e-05, "learning_rate": 2.9096989966555184e-06, "loss": 0.0046, "step": 126000 }, { "epoch": 12.61, "grad_norm": 1.940794572874438e-05, "learning_rate": 2.908026755852843e-06, "loss": 0.0023, "step": 126100 }, { "epoch": 12.62, "grad_norm": 0.001803192775696516, "learning_rate": 2.9063545150501677e-06, "loss": 0.0024, "step": 126200 }, { "epoch": 12.63, "grad_norm": 3.6869932955596596e-05, "learning_rate": 2.9046822742474918e-06, "loss": 0.0047, "step": 126300 }, { "epoch": 12.64, "grad_norm": 1.732857163005974e-05, "learning_rate": 2.9030100334448162e-06, "loss": 0.0, "step": 126400 }, { "epoch": 12.65, "grad_norm": 0.00017873154138214886, "learning_rate": 2.9013377926421403e-06, "loss": 0.0088, "step": 126500 }, { "epoch": 12.66, "grad_norm": 2.3783226424711756e-05, "learning_rate": 2.899665551839465e-06, "loss": 0.0026, "step": 126600 }, { "epoch": 12.67, "grad_norm": 0.01853839121758938, "learning_rate": 2.8979933110367896e-06, "loss": 0.0029, "step": 126700 }, { "epoch": 12.68, "grad_norm": 4.210330007481389e-05, "learning_rate": 2.8963210702341137e-06, "loss": 0.0037, "step": 126800 }, { "epoch": 12.69, "grad_norm": 0.00028920164913870394, "learning_rate": 2.8946488294314385e-06, "loss": 0.0052, "step": 126900 }, { "epoch": 12.7, "grad_norm": 7.593008922412992e-05, "learning_rate": 2.8929765886287626e-06, "loss": 0.0133, "step": 127000 }, { "epoch": 12.71, "grad_norm": 20.229549407958984, "learning_rate": 2.891304347826087e-06, "loss": 0.0194, "step": 127100 }, { "epoch": 12.72, "grad_norm": 0.00024867980391718447, "learning_rate": 2.889632107023412e-06, "loss": 0.0001, "step": 127200 }, { "epoch": 12.73, "grad_norm": 0.160929873585701, "learning_rate": 2.887959866220736e-06, "loss": 0.0018, "step": 127300 }, { "epoch": 12.74, "grad_norm": 0.0001910285500343889, "learning_rate": 2.8862876254180604e-06, "loss": 0.0, "step": 127400 }, { "epoch": 12.75, "grad_norm": 0.002502140821889043, "learning_rate": 2.8846153846153845e-06, "loss": 0.0042, "step": 127500 }, { "epoch": 12.76, "grad_norm": 9.12977775442414e-05, "learning_rate": 2.8829431438127093e-06, "loss": 0.0028, "step": 127600 }, { "epoch": 12.77, "grad_norm": 0.0001754213881213218, "learning_rate": 2.881270903010034e-06, "loss": 0.0, "step": 127700 }, { "epoch": 12.78, "grad_norm": 2.844112168531865e-05, "learning_rate": 2.879598662207358e-06, "loss": 0.0, "step": 127800 }, { "epoch": 12.79, "grad_norm": 7.382730109384283e-05, "learning_rate": 2.8779264214046827e-06, "loss": 0.0036, "step": 127900 }, { "epoch": 12.8, "grad_norm": 6.222332740435377e-05, "learning_rate": 2.8762541806020068e-06, "loss": 0.0, "step": 128000 }, { "epoch": 12.81, "grad_norm": 2.280520311614964e-05, "learning_rate": 2.8745819397993312e-06, "loss": 0.0, "step": 128100 }, { "epoch": 12.82, "grad_norm": 1.8901395378634334e-05, "learning_rate": 2.872909698996656e-06, "loss": 0.0, "step": 128200 }, { "epoch": 12.83, "grad_norm": 2.6766725568450056e-05, "learning_rate": 2.87123745819398e-06, "loss": 0.0038, "step": 128300 }, { "epoch": 12.84, "grad_norm": 4.2255389416823164e-05, "learning_rate": 2.8695652173913046e-06, "loss": 0.0002, "step": 128400 }, { "epoch": 12.85, "grad_norm": 1.8130673197447322e-05, "learning_rate": 2.8678929765886286e-06, "loss": 0.0127, "step": 128500 }, { "epoch": 12.86, "grad_norm": 0.00020949130703229457, "learning_rate": 2.8662207357859535e-06, "loss": 0.0041, "step": 128600 }, { "epoch": 12.87, "grad_norm": 8.374531898880377e-05, "learning_rate": 2.864548494983278e-06, "loss": 0.0, "step": 128700 }, { "epoch": 12.88, "grad_norm": 0.005811138078570366, "learning_rate": 2.862876254180602e-06, "loss": 0.0006, "step": 128800 }, { "epoch": 12.89, "grad_norm": 5.1641036407090724e-05, "learning_rate": 2.861204013377927e-06, "loss": 0.0061, "step": 128900 }, { "epoch": 12.9, "grad_norm": 0.00028346985345706344, "learning_rate": 2.859531772575251e-06, "loss": 0.0019, "step": 129000 }, { "epoch": 12.91, "grad_norm": 2.3813428924768232e-05, "learning_rate": 2.8578595317725754e-06, "loss": 0.0, "step": 129100 }, { "epoch": 12.92, "grad_norm": 5.4004725825507194e-05, "learning_rate": 2.8561872909699003e-06, "loss": 0.0, "step": 129200 }, { "epoch": 12.93, "grad_norm": 0.0009715522755868733, "learning_rate": 2.8545150501672243e-06, "loss": 0.0, "step": 129300 }, { "epoch": 12.94, "grad_norm": 6.110258254921064e-05, "learning_rate": 2.8528428093645488e-06, "loss": 0.0, "step": 129400 }, { "epoch": 12.95, "grad_norm": 2.5454342903685756e-05, "learning_rate": 2.851170568561873e-06, "loss": 0.0075, "step": 129500 }, { "epoch": 12.96, "grad_norm": 2.0261213649064302e-05, "learning_rate": 2.8494983277591977e-06, "loss": 0.0003, "step": 129600 }, { "epoch": 12.97, "grad_norm": 0.012711210176348686, "learning_rate": 2.847826086956522e-06, "loss": 0.0028, "step": 129700 }, { "epoch": 12.98, "grad_norm": 0.9762109518051147, "learning_rate": 2.846153846153846e-06, "loss": 0.0101, "step": 129800 }, { "epoch": 12.99, "grad_norm": 0.003656335175037384, "learning_rate": 2.844481605351171e-06, "loss": 0.0001, "step": 129900 }, { "epoch": 13.0, "grad_norm": 1.8448445189278573e-05, "learning_rate": 2.842809364548495e-06, "loss": 0.0055, "step": 130000 }, { "epoch": 13.0, "eval_accuracy": 0.98705, "eval_f1": 0.98705, "eval_loss": 0.11996782571077347, "eval_runtime": 135.7284, "eval_samples_per_second": 294.706, "eval_steps_per_second": 294.706, "step": 130000 }, { "epoch": 13.01, "grad_norm": 1.713820529403165e-05, "learning_rate": 2.8411371237458196e-06, "loss": 0.0, "step": 130100 }, { "epoch": 13.02, "grad_norm": 4.685423482442275e-05, "learning_rate": 2.8394648829431445e-06, "loss": 0.0001, "step": 130200 }, { "epoch": 13.03, "grad_norm": 0.0014853087486699224, "learning_rate": 2.8377926421404685e-06, "loss": 0.0042, "step": 130300 }, { "epoch": 13.04, "grad_norm": 1.4580614333681297e-05, "learning_rate": 2.836120401337793e-06, "loss": 0.003, "step": 130400 }, { "epoch": 13.05, "grad_norm": 1.3438087080430705e-05, "learning_rate": 2.834448160535117e-06, "loss": 0.0, "step": 130500 }, { "epoch": 13.06, "grad_norm": 7.020106568234041e-05, "learning_rate": 2.832775919732442e-06, "loss": 0.0, "step": 130600 }, { "epoch": 13.07, "grad_norm": 2.4668675905559212e-05, "learning_rate": 2.8311036789297663e-06, "loss": 0.0004, "step": 130700 }, { "epoch": 13.08, "grad_norm": 0.00015571806579828262, "learning_rate": 2.8294314381270904e-06, "loss": 0.0026, "step": 130800 }, { "epoch": 13.09, "grad_norm": 6.426640902645886e-05, "learning_rate": 2.8277591973244153e-06, "loss": 0.0021, "step": 130900 }, { "epoch": 13.1, "grad_norm": 2.0575174858095124e-05, "learning_rate": 2.8260869565217393e-06, "loss": 0.0035, "step": 131000 }, { "epoch": 13.11, "grad_norm": 1.2000875358353369e-05, "learning_rate": 2.8244147157190638e-06, "loss": 0.0, "step": 131100 }, { "epoch": 13.12, "grad_norm": 1.4357594409375452e-05, "learning_rate": 2.8227424749163882e-06, "loss": 0.0068, "step": 131200 }, { "epoch": 13.13, "grad_norm": 0.01679055020213127, "learning_rate": 2.8210702341137127e-06, "loss": 0.0, "step": 131300 }, { "epoch": 13.14, "grad_norm": 3.117432424915023e-05, "learning_rate": 2.819397993311037e-06, "loss": 0.0001, "step": 131400 }, { "epoch": 13.15, "grad_norm": 1.0254871995130088e-05, "learning_rate": 2.817725752508361e-06, "loss": 0.0013, "step": 131500 }, { "epoch": 13.16, "grad_norm": 1.2311931641306728e-05, "learning_rate": 2.816053511705686e-06, "loss": 0.0024, "step": 131600 }, { "epoch": 13.17, "grad_norm": 1.4948631360311992e-05, "learning_rate": 2.8143812709030105e-06, "loss": 0.0036, "step": 131700 }, { "epoch": 13.18, "grad_norm": 3.637359623098746e-05, "learning_rate": 2.8127090301003346e-06, "loss": 0.0, "step": 131800 }, { "epoch": 13.19, "grad_norm": 5.812734889332205e-05, "learning_rate": 2.811036789297659e-06, "loss": 0.0, "step": 131900 }, { "epoch": 13.2, "grad_norm": 2.516805034247227e-05, "learning_rate": 2.8093645484949835e-06, "loss": 0.0, "step": 132000 }, { "epoch": 13.21, "grad_norm": 0.0016383231850340962, "learning_rate": 2.807692307692308e-06, "loss": 0.0, "step": 132100 }, { "epoch": 13.22, "grad_norm": 3.990402183262631e-05, "learning_rate": 2.8060200668896324e-06, "loss": 0.0, "step": 132200 }, { "epoch": 13.23, "grad_norm": 3.431649747653864e-05, "learning_rate": 2.804347826086957e-06, "loss": 0.0, "step": 132300 }, { "epoch": 13.24, "grad_norm": 4.9618389311945066e-05, "learning_rate": 2.8026755852842813e-06, "loss": 0.0029, "step": 132400 }, { "epoch": 13.25, "grad_norm": 1.1744758921850007e-05, "learning_rate": 2.8010033444816054e-06, "loss": 0.0023, "step": 132500 }, { "epoch": 13.26, "grad_norm": 0.0007372087566182017, "learning_rate": 2.79933110367893e-06, "loss": 0.0039, "step": 132600 }, { "epoch": 13.27, "grad_norm": 1.3846334695699625e-05, "learning_rate": 2.7976588628762547e-06, "loss": 0.0, "step": 132700 }, { "epoch": 13.28, "grad_norm": 0.00011616638948908076, "learning_rate": 2.7959866220735787e-06, "loss": 0.0, "step": 132800 }, { "epoch": 13.29, "grad_norm": 1.7173537344206125e-05, "learning_rate": 2.794314381270903e-06, "loss": 0.0063, "step": 132900 }, { "epoch": 13.3, "grad_norm": 7.09253508830443e-05, "learning_rate": 2.7926421404682277e-06, "loss": 0.0062, "step": 133000 }, { "epoch": 13.31, "grad_norm": 0.037380144000053406, "learning_rate": 2.790969899665552e-06, "loss": 0.0001, "step": 133100 }, { "epoch": 13.32, "grad_norm": 1.3415536159300245e-05, "learning_rate": 2.7892976588628766e-06, "loss": 0.0101, "step": 133200 }, { "epoch": 13.33, "grad_norm": 1.4683989320474211e-05, "learning_rate": 2.7876254180602006e-06, "loss": 0.0032, "step": 133300 }, { "epoch": 13.34, "grad_norm": 2.834025872289203e-05, "learning_rate": 2.7859531772575255e-06, "loss": 0.0, "step": 133400 }, { "epoch": 13.35, "grad_norm": 6.283698894549161e-05, "learning_rate": 2.7842809364548495e-06, "loss": 0.0031, "step": 133500 }, { "epoch": 13.36, "grad_norm": 3.11326548398938e-05, "learning_rate": 2.782608695652174e-06, "loss": 0.0, "step": 133600 }, { "epoch": 13.37, "grad_norm": 0.0004122111131437123, "learning_rate": 2.780936454849499e-06, "loss": 0.0, "step": 133700 }, { "epoch": 13.38, "grad_norm": 0.0010729500791057944, "learning_rate": 2.779264214046823e-06, "loss": 0.0, "step": 133800 }, { "epoch": 13.39, "grad_norm": 2.4340455638593994e-05, "learning_rate": 2.7775919732441474e-06, "loss": 0.0, "step": 133900 }, { "epoch": 13.4, "grad_norm": 2.9749480745522305e-05, "learning_rate": 2.7759197324414714e-06, "loss": 0.0003, "step": 134000 }, { "epoch": 13.41, "grad_norm": 0.00011808306589955464, "learning_rate": 2.7742474916387963e-06, "loss": 0.0, "step": 134100 }, { "epoch": 13.42, "grad_norm": 9.720781235955656e-06, "learning_rate": 2.7725752508361208e-06, "loss": 0.0034, "step": 134200 }, { "epoch": 13.43, "grad_norm": 0.08499442040920258, "learning_rate": 2.770903010033445e-06, "loss": 0.0, "step": 134300 }, { "epoch": 13.44, "grad_norm": 6.802148709539324e-05, "learning_rate": 2.7692307692307697e-06, "loss": 0.0, "step": 134400 }, { "epoch": 13.45, "grad_norm": 1.7490610844106413e-05, "learning_rate": 2.7675585284280937e-06, "loss": 0.0, "step": 134500 }, { "epoch": 13.46, "grad_norm": 1.0484201993676834e-05, "learning_rate": 2.765886287625418e-06, "loss": 0.0, "step": 134600 }, { "epoch": 13.47, "grad_norm": 1.108324977394659e-05, "learning_rate": 2.764214046822743e-06, "loss": 0.0, "step": 134700 }, { "epoch": 13.48, "grad_norm": 0.00023472192697227, "learning_rate": 2.762541806020067e-06, "loss": 0.0028, "step": 134800 }, { "epoch": 13.49, "grad_norm": 2.1847616153536364e-05, "learning_rate": 2.7608695652173916e-06, "loss": 0.0032, "step": 134900 }, { "epoch": 13.5, "grad_norm": 1.1873323273903225e-05, "learning_rate": 2.7591973244147156e-06, "loss": 0.0041, "step": 135000 }, { "epoch": 13.5, "eval_accuracy": 0.9864, "eval_f1": 0.9864, "eval_loss": 0.13605718314647675, "eval_runtime": 136.6144, "eval_samples_per_second": 292.795, "eval_steps_per_second": 292.795, "step": 135000 }, { "epoch": 13.51, "grad_norm": 6.469953223131597e-05, "learning_rate": 2.7575250836120405e-06, "loss": 0.0056, "step": 135100 }, { "epoch": 13.52, "grad_norm": 1.1132638064736966e-05, "learning_rate": 2.755852842809365e-06, "loss": 0.0, "step": 135200 }, { "epoch": 13.53, "grad_norm": 1.0743293387349695e-05, "learning_rate": 2.754180602006689e-06, "loss": 0.0022, "step": 135300 }, { "epoch": 13.54, "grad_norm": 1.1827138223452494e-05, "learning_rate": 2.752508361204014e-06, "loss": 0.0038, "step": 135400 }, { "epoch": 13.55, "grad_norm": 1.2733278708765283e-05, "learning_rate": 2.750836120401338e-06, "loss": 0.0037, "step": 135500 }, { "epoch": 13.56, "grad_norm": 1.2386645721562672e-05, "learning_rate": 2.7491638795986624e-06, "loss": 0.0, "step": 135600 }, { "epoch": 13.57, "grad_norm": 8.410525879298802e-06, "learning_rate": 2.7474916387959864e-06, "loss": 0.0, "step": 135700 }, { "epoch": 13.58, "grad_norm": 4.612410339177586e-05, "learning_rate": 2.7458193979933113e-06, "loss": 0.0, "step": 135800 }, { "epoch": 13.59, "grad_norm": 2.6822170184459537e-05, "learning_rate": 2.7441471571906358e-06, "loss": 0.0, "step": 135900 }, { "epoch": 13.6, "grad_norm": 1.6195193893508986e-05, "learning_rate": 2.74247491638796e-06, "loss": 0.0, "step": 136000 }, { "epoch": 13.61, "grad_norm": 1.1398445167287719e-05, "learning_rate": 2.7408026755852847e-06, "loss": 0.0, "step": 136100 }, { "epoch": 13.62, "grad_norm": 7.909869964350946e-06, "learning_rate": 2.7391304347826087e-06, "loss": 0.0047, "step": 136200 }, { "epoch": 13.63, "grad_norm": 8.60071577335475e-06, "learning_rate": 2.737458193979933e-06, "loss": 0.0002, "step": 136300 }, { "epoch": 13.64, "grad_norm": 1.0402597581560258e-05, "learning_rate": 2.735785953177258e-06, "loss": 0.0036, "step": 136400 }, { "epoch": 13.65, "grad_norm": 9.145016520051286e-06, "learning_rate": 2.734113712374582e-06, "loss": 0.0024, "step": 136500 }, { "epoch": 13.66, "grad_norm": 1.3457996828947216e-05, "learning_rate": 2.7324414715719066e-06, "loss": 0.004, "step": 136600 }, { "epoch": 13.67, "grad_norm": 1.3402839613263495e-05, "learning_rate": 2.7307692307692306e-06, "loss": 0.0, "step": 136700 }, { "epoch": 13.68, "grad_norm": 1.228326709679095e-05, "learning_rate": 2.7290969899665555e-06, "loss": 0.0079, "step": 136800 }, { "epoch": 13.69, "grad_norm": 0.00012510253873188049, "learning_rate": 2.72742474916388e-06, "loss": 0.0026, "step": 136900 }, { "epoch": 13.7, "grad_norm": 2.1446710888994858e-05, "learning_rate": 2.725752508361204e-06, "loss": 0.0, "step": 137000 }, { "epoch": 13.71, "grad_norm": 2.1957304852548987e-05, "learning_rate": 2.724080267558529e-06, "loss": 0.0, "step": 137100 }, { "epoch": 13.72, "grad_norm": 1.3254529221740086e-05, "learning_rate": 2.722408026755853e-06, "loss": 0.0088, "step": 137200 }, { "epoch": 13.73, "grad_norm": 2.5398878278792836e-05, "learning_rate": 2.7207357859531774e-06, "loss": 0.0047, "step": 137300 }, { "epoch": 13.74, "grad_norm": 3.166155147482641e-05, "learning_rate": 2.7190635451505022e-06, "loss": 0.017, "step": 137400 }, { "epoch": 13.75, "grad_norm": 0.0029113488271832466, "learning_rate": 2.7173913043478263e-06, "loss": 0.0, "step": 137500 }, { "epoch": 13.76, "grad_norm": 1.447906470275484e-05, "learning_rate": 2.7157190635451507e-06, "loss": 0.0002, "step": 137600 }, { "epoch": 13.77, "grad_norm": 0.00028520452906377614, "learning_rate": 2.7140468227424748e-06, "loss": 0.0, "step": 137700 }, { "epoch": 13.78, "grad_norm": 0.00032373316935263574, "learning_rate": 2.7123745819397997e-06, "loss": 0.0, "step": 137800 }, { "epoch": 13.79, "grad_norm": 1.5336469004978426e-05, "learning_rate": 2.710702341137124e-06, "loss": 0.0048, "step": 137900 }, { "epoch": 13.8, "grad_norm": 4.3872540118172765e-05, "learning_rate": 2.709030100334448e-06, "loss": 0.0035, "step": 138000 }, { "epoch": 13.81, "grad_norm": 3.1709718314232305e-05, "learning_rate": 2.707357859531773e-06, "loss": 0.0001, "step": 138100 }, { "epoch": 13.82, "grad_norm": 3.154708610964008e-05, "learning_rate": 2.705685618729097e-06, "loss": 0.0039, "step": 138200 }, { "epoch": 13.83, "grad_norm": 0.0019509652629494667, "learning_rate": 2.7040133779264215e-06, "loss": 0.0101, "step": 138300 }, { "epoch": 13.84, "grad_norm": 3.1300398404709995e-05, "learning_rate": 2.7023411371237464e-06, "loss": 0.0033, "step": 138400 }, { "epoch": 13.85, "grad_norm": 0.02786121517419815, "learning_rate": 2.7006688963210705e-06, "loss": 0.0025, "step": 138500 }, { "epoch": 13.86, "grad_norm": 0.01319398358464241, "learning_rate": 2.698996655518395e-06, "loss": 0.0, "step": 138600 }, { "epoch": 13.87, "grad_norm": 3.6790541344089434e-05, "learning_rate": 2.697324414715719e-06, "loss": 0.0, "step": 138700 }, { "epoch": 13.88, "grad_norm": 0.0003256149939261377, "learning_rate": 2.695652173913044e-06, "loss": 0.0009, "step": 138800 }, { "epoch": 13.89, "grad_norm": 0.0008701249025762081, "learning_rate": 2.6939799331103683e-06, "loss": 0.0057, "step": 138900 }, { "epoch": 13.9, "grad_norm": 0.00016028298705350608, "learning_rate": 2.6923076923076923e-06, "loss": 0.0044, "step": 139000 }, { "epoch": 13.91, "grad_norm": 4.8700254410505295e-05, "learning_rate": 2.6906354515050172e-06, "loss": 0.0037, "step": 139100 }, { "epoch": 13.92, "grad_norm": 1.1900821846211329e-05, "learning_rate": 2.6889632107023413e-06, "loss": 0.0, "step": 139200 }, { "epoch": 13.93, "grad_norm": 3.17985650326591e-05, "learning_rate": 2.6872909698996657e-06, "loss": 0.0, "step": 139300 }, { "epoch": 13.94, "grad_norm": 1.7800852219806984e-05, "learning_rate": 2.6856187290969906e-06, "loss": 0.0, "step": 139400 }, { "epoch": 13.95, "grad_norm": 0.00012519603478722274, "learning_rate": 2.6839464882943146e-06, "loss": 0.0, "step": 139500 }, { "epoch": 13.96, "grad_norm": 1.7471717001171783e-05, "learning_rate": 2.682274247491639e-06, "loss": 0.0, "step": 139600 }, { "epoch": 13.97, "grad_norm": 2.409886656096205e-05, "learning_rate": 2.680602006688963e-06, "loss": 0.0, "step": 139700 }, { "epoch": 13.98, "grad_norm": 9.131293336395174e-05, "learning_rate": 2.678929765886288e-06, "loss": 0.0, "step": 139800 }, { "epoch": 13.99, "grad_norm": 0.00039141171146184206, "learning_rate": 2.6772575250836125e-06, "loss": 0.0017, "step": 139900 }, { "epoch": 14.0, "grad_norm": 0.00036528526106849313, "learning_rate": 2.6755852842809365e-06, "loss": 0.005, "step": 140000 }, { "epoch": 14.0, "eval_accuracy": 0.987025, "eval_f1": 0.987025, "eval_loss": 0.12981809675693512, "eval_runtime": 138.4883, "eval_samples_per_second": 288.833, "eval_steps_per_second": 288.833, "step": 140000 }, { "epoch": 14.01, "grad_norm": 1.970234006876126e-05, "learning_rate": 2.6739130434782614e-06, "loss": 0.0, "step": 140100 }, { "epoch": 14.02, "grad_norm": 4.8702968342695385e-05, "learning_rate": 2.6722408026755854e-06, "loss": 0.003, "step": 140200 }, { "epoch": 14.03, "grad_norm": 1.3415955436357763e-05, "learning_rate": 2.67056856187291e-06, "loss": 0.0051, "step": 140300 }, { "epoch": 14.04, "grad_norm": 9.642380064178724e-06, "learning_rate": 2.6688963210702344e-06, "loss": 0.0, "step": 140400 }, { "epoch": 14.05, "grad_norm": 1.1794251804531086e-05, "learning_rate": 2.667224080267559e-06, "loss": 0.0, "step": 140500 }, { "epoch": 14.06, "grad_norm": 1.1837163583550137e-05, "learning_rate": 2.6655518394648833e-06, "loss": 0.0, "step": 140600 }, { "epoch": 14.07, "grad_norm": 0.00018968567019328475, "learning_rate": 2.6638795986622073e-06, "loss": 0.0, "step": 140700 }, { "epoch": 14.08, "grad_norm": 2.6006642656284384e-05, "learning_rate": 2.662207357859532e-06, "loss": 0.0, "step": 140800 }, { "epoch": 14.09, "grad_norm": 1.6532772860955447e-05, "learning_rate": 2.6605351170568567e-06, "loss": 0.0, "step": 140900 }, { "epoch": 14.1, "grad_norm": 3.34562428179197e-05, "learning_rate": 2.6588628762541807e-06, "loss": 0.003, "step": 141000 }, { "epoch": 14.11, "grad_norm": 0.00025544603704474866, "learning_rate": 2.657190635451505e-06, "loss": 0.0048, "step": 141100 }, { "epoch": 14.12, "grad_norm": 1.0975451914418954e-05, "learning_rate": 2.6555183946488296e-06, "loss": 0.0, "step": 141200 }, { "epoch": 14.13, "grad_norm": 5.2620616770582274e-05, "learning_rate": 2.653846153846154e-06, "loss": 0.0072, "step": 141300 }, { "epoch": 14.14, "grad_norm": 9.236534424417187e-06, "learning_rate": 2.6521739130434785e-06, "loss": 0.0, "step": 141400 }, { "epoch": 14.15, "grad_norm": 1.7573007426108234e-05, "learning_rate": 2.650501672240803e-06, "loss": 0.0, "step": 141500 }, { "epoch": 14.16, "grad_norm": 1.277837418456329e-05, "learning_rate": 2.6488294314381275e-06, "loss": 0.007, "step": 141600 }, { "epoch": 14.17, "grad_norm": 1.1337443538650405e-05, "learning_rate": 2.6471571906354515e-06, "loss": 0.0, "step": 141700 }, { "epoch": 14.18, "grad_norm": 2.1818048480781727e-05, "learning_rate": 2.645484949832776e-06, "loss": 0.0, "step": 141800 }, { "epoch": 14.19, "grad_norm": 2.7288138880976476e-05, "learning_rate": 2.643812709030101e-06, "loss": 0.0018, "step": 141900 }, { "epoch": 14.2, "grad_norm": 1.165820867754519e-05, "learning_rate": 2.642140468227425e-06, "loss": 0.0028, "step": 142000 }, { "epoch": 14.21, "grad_norm": 9.926073289534543e-06, "learning_rate": 2.6404682274247493e-06, "loss": 0.0047, "step": 142100 }, { "epoch": 14.22, "grad_norm": 1.514962968940381e-05, "learning_rate": 2.638795986622074e-06, "loss": 0.0034, "step": 142200 }, { "epoch": 14.23, "grad_norm": 8.196759154088795e-06, "learning_rate": 2.6371237458193983e-06, "loss": 0.0, "step": 142300 }, { "epoch": 14.24, "grad_norm": 2.0415285689523444e-05, "learning_rate": 2.6354515050167227e-06, "loss": 0.0, "step": 142400 }, { "epoch": 14.25, "grad_norm": 0.00016611946921329945, "learning_rate": 2.6337792642140468e-06, "loss": 0.0005, "step": 142500 }, { "epoch": 14.26, "grad_norm": 6.390087946783751e-06, "learning_rate": 2.6321070234113716e-06, "loss": 0.0001, "step": 142600 }, { "epoch": 14.27, "grad_norm": 1.526228697912302e-05, "learning_rate": 2.6304347826086957e-06, "loss": 0.0105, "step": 142700 }, { "epoch": 14.28, "grad_norm": 2.9013786843279377e-05, "learning_rate": 2.62876254180602e-06, "loss": 0.0, "step": 142800 }, { "epoch": 14.29, "grad_norm": 5.3516596381086856e-05, "learning_rate": 2.627090301003345e-06, "loss": 0.0032, "step": 142900 }, { "epoch": 14.3, "grad_norm": 1.080665515473811e-05, "learning_rate": 2.625418060200669e-06, "loss": 0.0049, "step": 143000 }, { "epoch": 14.31, "grad_norm": 1.2666178918152582e-05, "learning_rate": 2.6237458193979935e-06, "loss": 0.0, "step": 143100 }, { "epoch": 14.32, "grad_norm": 3.2697953429305926e-05, "learning_rate": 2.6220735785953176e-06, "loss": 0.0, "step": 143200 }, { "epoch": 14.33, "grad_norm": 3.8729096559109166e-05, "learning_rate": 2.6204013377926424e-06, "loss": 0.0, "step": 143300 }, { "epoch": 14.34, "grad_norm": 7.637468115717638e-06, "learning_rate": 2.618729096989967e-06, "loss": 0.0016, "step": 143400 }, { "epoch": 14.35, "grad_norm": 0.000350749003700912, "learning_rate": 2.617056856187291e-06, "loss": 0.0, "step": 143500 }, { "epoch": 14.36, "grad_norm": 6.134332579676993e-06, "learning_rate": 2.615384615384616e-06, "loss": 0.0, "step": 143600 }, { "epoch": 14.37, "grad_norm": 6.324955393210985e-06, "learning_rate": 2.61371237458194e-06, "loss": 0.0009, "step": 143700 }, { "epoch": 14.38, "grad_norm": 4.8610727390041575e-06, "learning_rate": 2.6120401337792643e-06, "loss": 0.0, "step": 143800 }, { "epoch": 14.39, "grad_norm": 8.644431363791227e-05, "learning_rate": 2.610367892976589e-06, "loss": 0.0002, "step": 143900 }, { "epoch": 14.4, "grad_norm": 7.85917291068472e-06, "learning_rate": 2.6086956521739132e-06, "loss": 0.0059, "step": 144000 }, { "epoch": 14.41, "grad_norm": 1.2011563740088604e-05, "learning_rate": 2.6070234113712377e-06, "loss": 0.0042, "step": 144100 }, { "epoch": 14.42, "grad_norm": 4.436729068402201e-05, "learning_rate": 2.6053511705685617e-06, "loss": 0.0017, "step": 144200 }, { "epoch": 14.43, "grad_norm": 9.664974641054869e-06, "learning_rate": 2.6036789297658866e-06, "loss": 0.0, "step": 144300 }, { "epoch": 14.44, "grad_norm": 6.7011196733801626e-06, "learning_rate": 2.602006688963211e-06, "loss": 0.0, "step": 144400 }, { "epoch": 14.45, "grad_norm": 3.2653235393809155e-05, "learning_rate": 2.600334448160535e-06, "loss": 0.0, "step": 144500 }, { "epoch": 14.46, "grad_norm": 8.305092705995776e-06, "learning_rate": 2.59866220735786e-06, "loss": 0.005, "step": 144600 }, { "epoch": 14.47, "grad_norm": 4.4582277041627094e-05, "learning_rate": 2.596989966555184e-06, "loss": 0.0011, "step": 144700 }, { "epoch": 14.48, "grad_norm": 9.818021680985112e-06, "learning_rate": 2.5953177257525085e-06, "loss": 0.0, "step": 144800 }, { "epoch": 14.49, "grad_norm": 4.905187597614713e-05, "learning_rate": 2.5936454849498334e-06, "loss": 0.0, "step": 144900 }, { "epoch": 14.5, "grad_norm": 3.9814804040361196e-05, "learning_rate": 2.5919732441471574e-06, "loss": 0.0, "step": 145000 }, { "epoch": 14.5, "eval_accuracy": 0.98695, "eval_f1": 0.98695, "eval_loss": 0.13318751752376556, "eval_runtime": 136.0623, "eval_samples_per_second": 293.983, "eval_steps_per_second": 293.983, "step": 145000 }, { "epoch": 14.51, "grad_norm": 3.010919681400992e-05, "learning_rate": 2.590301003344482e-06, "loss": 0.0017, "step": 145100 }, { "epoch": 14.52, "grad_norm": 1.9201515897293575e-05, "learning_rate": 2.588628762541806e-06, "loss": 0.0, "step": 145200 }, { "epoch": 14.53, "grad_norm": 6.911823220434599e-06, "learning_rate": 2.586956521739131e-06, "loss": 0.0, "step": 145300 }, { "epoch": 14.54, "grad_norm": 0.00012770664761774242, "learning_rate": 2.5852842809364553e-06, "loss": 0.0, "step": 145400 }, { "epoch": 14.55, "grad_norm": 0.004662015475332737, "learning_rate": 2.5836120401337793e-06, "loss": 0.0, "step": 145500 }, { "epoch": 14.56, "grad_norm": 0.0038990043103694916, "learning_rate": 2.581939799331104e-06, "loss": 0.0046, "step": 145600 }, { "epoch": 14.57, "grad_norm": 1.2492128007579595e-05, "learning_rate": 2.5802675585284282e-06, "loss": 0.0, "step": 145700 }, { "epoch": 14.58, "grad_norm": 6.336121259664651e-06, "learning_rate": 2.5785953177257527e-06, "loss": 0.0, "step": 145800 }, { "epoch": 14.59, "grad_norm": 7.761910637782421e-06, "learning_rate": 2.5769230769230767e-06, "loss": 0.0, "step": 145900 }, { "epoch": 14.6, "grad_norm": 0.0003560628683771938, "learning_rate": 2.5752508361204016e-06, "loss": 0.0, "step": 146000 }, { "epoch": 14.61, "grad_norm": 9.447974662180059e-06, "learning_rate": 2.573578595317726e-06, "loss": 0.0, "step": 146100 }, { "epoch": 14.62, "grad_norm": 6.381172715919092e-05, "learning_rate": 2.57190635451505e-06, "loss": 0.0039, "step": 146200 }, { "epoch": 14.63, "grad_norm": 1.7973427020478994e-05, "learning_rate": 2.570234113712375e-06, "loss": 0.0053, "step": 146300 }, { "epoch": 14.64, "grad_norm": 8.584712304582354e-06, "learning_rate": 2.568561872909699e-06, "loss": 0.0101, "step": 146400 }, { "epoch": 14.65, "grad_norm": 1.8466884284862317e-05, "learning_rate": 2.5668896321070235e-06, "loss": 0.0, "step": 146500 }, { "epoch": 14.66, "grad_norm": 7.304724022105802e-06, "learning_rate": 2.5652173913043484e-06, "loss": 0.0, "step": 146600 }, { "epoch": 14.67, "grad_norm": 6.799628863518592e-06, "learning_rate": 2.5635451505016724e-06, "loss": 0.0, "step": 146700 }, { "epoch": 14.68, "grad_norm": 9.02667215996189e-06, "learning_rate": 2.561872909698997e-06, "loss": 0.0, "step": 146800 }, { "epoch": 14.69, "grad_norm": 4.482056829147041e-05, "learning_rate": 2.560200668896321e-06, "loss": 0.0, "step": 146900 }, { "epoch": 14.7, "grad_norm": 8.890767276170664e-06, "learning_rate": 2.5585284280936458e-06, "loss": 0.0033, "step": 147000 }, { "epoch": 14.71, "grad_norm": 1.5810957847861573e-05, "learning_rate": 2.5568561872909702e-06, "loss": 0.0039, "step": 147100 }, { "epoch": 14.72, "grad_norm": 8.200189040508121e-05, "learning_rate": 2.5551839464882943e-06, "loss": 0.0032, "step": 147200 }, { "epoch": 14.73, "grad_norm": 1.996455466724001e-05, "learning_rate": 2.553511705685619e-06, "loss": 0.0, "step": 147300 }, { "epoch": 14.74, "grad_norm": 1.1589927453314885e-05, "learning_rate": 2.551839464882943e-06, "loss": 0.0, "step": 147400 }, { "epoch": 14.75, "grad_norm": 8.36480176076293e-05, "learning_rate": 2.5501672240802677e-06, "loss": 0.0, "step": 147500 }, { "epoch": 14.76, "grad_norm": 1.1121761417598464e-05, "learning_rate": 2.5484949832775925e-06, "loss": 0.0, "step": 147600 }, { "epoch": 14.77, "grad_norm": 9.952851542038843e-06, "learning_rate": 2.5468227424749166e-06, "loss": 0.0005, "step": 147700 }, { "epoch": 14.78, "grad_norm": 0.0005833844188600779, "learning_rate": 2.545150501672241e-06, "loss": 0.0036, "step": 147800 }, { "epoch": 14.79, "grad_norm": 0.000579743180423975, "learning_rate": 2.543478260869565e-06, "loss": 0.0, "step": 147900 }, { "epoch": 14.8, "grad_norm": 0.007290335837751627, "learning_rate": 2.54180602006689e-06, "loss": 0.0, "step": 148000 }, { "epoch": 14.81, "grad_norm": 7.68768040870782e-06, "learning_rate": 2.5401337792642144e-06, "loss": 0.0032, "step": 148100 }, { "epoch": 14.82, "grad_norm": 1.0321867193852086e-05, "learning_rate": 2.5384615384615385e-06, "loss": 0.0, "step": 148200 }, { "epoch": 14.83, "grad_norm": 5.90362378716236e-06, "learning_rate": 2.5367892976588633e-06, "loss": 0.0, "step": 148300 }, { "epoch": 14.84, "grad_norm": 6.799234597565373e-06, "learning_rate": 2.5351170568561874e-06, "loss": 0.0, "step": 148400 }, { "epoch": 14.85, "grad_norm": 7.403478321066359e-06, "learning_rate": 2.533444816053512e-06, "loss": 0.0045, "step": 148500 }, { "epoch": 14.86, "grad_norm": 1.4559896953869611e-05, "learning_rate": 2.5317725752508367e-06, "loss": 0.0018, "step": 148600 }, { "epoch": 14.87, "grad_norm": 6.7038836277788505e-06, "learning_rate": 2.5301003344481608e-06, "loss": 0.0, "step": 148700 }, { "epoch": 14.88, "grad_norm": 6.24095082457643e-06, "learning_rate": 2.5284280936454852e-06, "loss": 0.0, "step": 148800 }, { "epoch": 14.89, "grad_norm": 6.512796971946955e-06, "learning_rate": 2.5267558528428093e-06, "loss": 0.0079, "step": 148900 }, { "epoch": 14.9, "grad_norm": 0.0056665875017642975, "learning_rate": 2.525083612040134e-06, "loss": 0.0065, "step": 149000 }, { "epoch": 14.91, "grad_norm": 9.204671187035274e-06, "learning_rate": 2.5234113712374586e-06, "loss": 0.0062, "step": 149100 }, { "epoch": 14.92, "grad_norm": 1.0256294444843661e-05, "learning_rate": 2.5217391304347826e-06, "loss": 0.0003, "step": 149200 }, { "epoch": 14.93, "grad_norm": 0.0003551799163687974, "learning_rate": 2.5200668896321075e-06, "loss": 0.0032, "step": 149300 }, { "epoch": 14.94, "grad_norm": 0.00010754630056908354, "learning_rate": 2.5183946488294316e-06, "loss": 0.0001, "step": 149400 }, { "epoch": 14.95, "grad_norm": 8.264900679932907e-05, "learning_rate": 2.516722408026756e-06, "loss": 0.0, "step": 149500 }, { "epoch": 14.96, "grad_norm": 1.0027822099800687e-05, "learning_rate": 2.5150501672240805e-06, "loss": 0.0011, "step": 149600 }, { "epoch": 14.97, "grad_norm": 0.0418342724442482, "learning_rate": 2.513377926421405e-06, "loss": 0.0, "step": 149700 }, { "epoch": 14.98, "grad_norm": 8.36113395052962e-06, "learning_rate": 2.5117056856187294e-06, "loss": 0.0059, "step": 149800 }, { "epoch": 14.99, "grad_norm": 1.0185762221226469e-05, "learning_rate": 2.5100334448160534e-06, "loss": 0.004, "step": 149900 }, { "epoch": 15.0, "grad_norm": 4.18316267314367e-05, "learning_rate": 2.5083612040133783e-06, "loss": 0.0, "step": 150000 }, { "epoch": 15.0, "eval_accuracy": 0.9869, "eval_f1": 0.9869, "eval_loss": 0.13588353991508484, "eval_runtime": 135.008, "eval_samples_per_second": 296.279, "eval_steps_per_second": 296.279, "step": 150000 }, { "epoch": 15.01, "grad_norm": 2.9284803531481884e-05, "learning_rate": 2.5066889632107028e-06, "loss": 0.0014, "step": 150100 }, { "epoch": 15.02, "grad_norm": 2.252110789413564e-05, "learning_rate": 2.505016722408027e-06, "loss": 0.0037, "step": 150200 }, { "epoch": 15.03, "grad_norm": 7.923205885163043e-06, "learning_rate": 2.5033444816053513e-06, "loss": 0.0006, "step": 150300 }, { "epoch": 15.04, "grad_norm": 6.102809038566193e-06, "learning_rate": 2.5016722408026757e-06, "loss": 0.0064, "step": 150400 }, { "epoch": 15.05, "grad_norm": 6.924793979123933e-06, "learning_rate": 2.5e-06, "loss": 0.0, "step": 150500 }, { "epoch": 15.06, "grad_norm": 0.00029880605870857835, "learning_rate": 2.4983277591973247e-06, "loss": 0.0, "step": 150600 }, { "epoch": 15.07, "grad_norm": 5.71871669308166e-06, "learning_rate": 2.496655518394649e-06, "loss": 0.003, "step": 150700 }, { "epoch": 15.08, "grad_norm": 7.58796022637398e-06, "learning_rate": 2.4949832775919736e-06, "loss": 0.0, "step": 150800 }, { "epoch": 15.09, "grad_norm": 0.00011463899136288092, "learning_rate": 2.493311036789298e-06, "loss": 0.0011, "step": 150900 }, { "epoch": 15.1, "grad_norm": 9.345097168989014e-06, "learning_rate": 2.491638795986622e-06, "loss": 0.0, "step": 151000 }, { "epoch": 15.11, "grad_norm": 0.2711752653121948, "learning_rate": 2.4899665551839465e-06, "loss": 0.0, "step": 151100 }, { "epoch": 15.12, "grad_norm": 7.29593557480257e-06, "learning_rate": 2.488294314381271e-06, "loss": 0.0, "step": 151200 }, { "epoch": 15.13, "grad_norm": 9.790243893803563e-06, "learning_rate": 2.4866220735785955e-06, "loss": 0.0, "step": 151300 }, { "epoch": 15.14, "grad_norm": 2.9339826141949743e-05, "learning_rate": 2.48494983277592e-06, "loss": 0.0, "step": 151400 }, { "epoch": 15.15, "grad_norm": 1.5411867934744805e-05, "learning_rate": 2.4832775919732444e-06, "loss": 0.0002, "step": 151500 }, { "epoch": 15.16, "grad_norm": 5.220775165071245e-06, "learning_rate": 2.481605351170569e-06, "loss": 0.0, "step": 151600 }, { "epoch": 15.17, "grad_norm": 4.955260919814464e-06, "learning_rate": 2.479933110367893e-06, "loss": 0.0, "step": 151700 }, { "epoch": 15.18, "grad_norm": 4.50190918854787e-06, "learning_rate": 2.4782608695652178e-06, "loss": 0.0023, "step": 151800 }, { "epoch": 15.19, "grad_norm": 6.239830327103846e-06, "learning_rate": 2.4765886287625422e-06, "loss": 0.0, "step": 151900 }, { "epoch": 15.2, "grad_norm": 0.0002775177708826959, "learning_rate": 2.4749163879598663e-06, "loss": 0.0035, "step": 152000 }, { "epoch": 15.21, "grad_norm": 0.0001772949326550588, "learning_rate": 2.4732441471571907e-06, "loss": 0.0001, "step": 152100 }, { "epoch": 15.22, "grad_norm": 0.0001343058975180611, "learning_rate": 2.471571906354515e-06, "loss": 0.0035, "step": 152200 }, { "epoch": 15.23, "grad_norm": 4.652582447306486e-06, "learning_rate": 2.4698996655518396e-06, "loss": 0.0, "step": 152300 }, { "epoch": 15.24, "grad_norm": 4.857491603615927e-06, "learning_rate": 2.468227424749164e-06, "loss": 0.0, "step": 152400 }, { "epoch": 15.25, "grad_norm": 5.226095254329266e-06, "learning_rate": 2.4665551839464886e-06, "loss": 0.0, "step": 152500 }, { "epoch": 15.26, "grad_norm": 6.556086646014592e-06, "learning_rate": 2.464882943143813e-06, "loss": 0.0029, "step": 152600 }, { "epoch": 15.27, "grad_norm": 7.299048320419388e-06, "learning_rate": 2.463210702341137e-06, "loss": 0.0, "step": 152700 }, { "epoch": 15.28, "grad_norm": 6.213837878021877e-06, "learning_rate": 2.461538461538462e-06, "loss": 0.0, "step": 152800 }, { "epoch": 15.29, "grad_norm": 0.0003703912952914834, "learning_rate": 2.4598662207357864e-06, "loss": 0.0, "step": 152900 }, { "epoch": 15.3, "grad_norm": 3.938354893762153e-06, "learning_rate": 2.4581939799331104e-06, "loss": 0.0, "step": 153000 }, { "epoch": 15.31, "grad_norm": 1.6065227100625634e-05, "learning_rate": 2.456521739130435e-06, "loss": 0.0, "step": 153100 }, { "epoch": 15.32, "grad_norm": 1.110832454287447e-05, "learning_rate": 2.4548494983277594e-06, "loss": 0.0054, "step": 153200 }, { "epoch": 15.33, "grad_norm": 7.1401191235054284e-06, "learning_rate": 2.453177257525084e-06, "loss": 0.0032, "step": 153300 }, { "epoch": 15.34, "grad_norm": 6.627889433730161e-06, "learning_rate": 2.4515050167224083e-06, "loss": 0.0044, "step": 153400 }, { "epoch": 15.35, "grad_norm": 0.05635225027799606, "learning_rate": 2.4498327759197327e-06, "loss": 0.0051, "step": 153500 }, { "epoch": 15.36, "grad_norm": 4.67062136522145e-06, "learning_rate": 2.448160535117057e-06, "loss": 0.0009, "step": 153600 }, { "epoch": 15.37, "grad_norm": 0.0002716428425628692, "learning_rate": 2.4464882943143812e-06, "loss": 0.0068, "step": 153700 }, { "epoch": 15.38, "grad_norm": 3.66365748050157e-05, "learning_rate": 2.4448160535117057e-06, "loss": 0.0042, "step": 153800 }, { "epoch": 15.39, "grad_norm": 1.4453570656769443e-05, "learning_rate": 2.4431438127090306e-06, "loss": 0.0025, "step": 153900 }, { "epoch": 15.4, "grad_norm": 0.0001801799953682348, "learning_rate": 2.4414715719063546e-06, "loss": 0.0, "step": 154000 }, { "epoch": 15.41, "grad_norm": 7.897392606537323e-06, "learning_rate": 2.439799331103679e-06, "loss": 0.0082, "step": 154100 }, { "epoch": 15.42, "grad_norm": 0.0011347578838467598, "learning_rate": 2.4381270903010035e-06, "loss": 0.0012, "step": 154200 }, { "epoch": 15.43, "grad_norm": 8.59985357237747e-06, "learning_rate": 2.436454849498328e-06, "loss": 0.0, "step": 154300 }, { "epoch": 15.44, "grad_norm": 1.318647809966933e-05, "learning_rate": 2.4347826086956525e-06, "loss": 0.0, "step": 154400 }, { "epoch": 15.45, "grad_norm": 1.7026190107571892e-05, "learning_rate": 2.433110367892977e-06, "loss": 0.0008, "step": 154500 }, { "epoch": 15.46, "grad_norm": 5.7994143389805686e-06, "learning_rate": 2.4314381270903014e-06, "loss": 0.0039, "step": 154600 }, { "epoch": 15.47, "grad_norm": 5.057373982708668e-06, "learning_rate": 2.4297658862876254e-06, "loss": 0.0, "step": 154700 }, { "epoch": 15.48, "grad_norm": 5.878926458535716e-06, "learning_rate": 2.42809364548495e-06, "loss": 0.0, "step": 154800 }, { "epoch": 15.49, "grad_norm": 5.919266641285503e-06, "learning_rate": 2.4264214046822743e-06, "loss": 0.0, "step": 154900 }, { "epoch": 15.5, "grad_norm": 1.6888472600840032e-05, "learning_rate": 2.424749163879599e-06, "loss": 0.0, "step": 155000 }, { "epoch": 15.5, "eval_accuracy": 0.9874, "eval_f1": 0.9874, "eval_loss": 0.13571089506149292, "eval_runtime": 133.2814, "eval_samples_per_second": 300.117, "eval_steps_per_second": 300.117, "step": 155000 }, { "epoch": 15.51, "grad_norm": 0.0019413414411246777, "learning_rate": 2.4230769230769233e-06, "loss": 0.0043, "step": 155100 }, { "epoch": 15.52, "grad_norm": 2.8746006137225777e-05, "learning_rate": 2.4214046822742477e-06, "loss": 0.0, "step": 155200 }, { "epoch": 15.53, "grad_norm": 6.713657057844102e-05, "learning_rate": 2.419732441471572e-06, "loss": 0.0, "step": 155300 }, { "epoch": 15.54, "grad_norm": 0.01398865319788456, "learning_rate": 2.4180602006688967e-06, "loss": 0.0, "step": 155400 }, { "epoch": 15.55, "grad_norm": 8.839925430947915e-05, "learning_rate": 2.416387959866221e-06, "loss": 0.0, "step": 155500 }, { "epoch": 15.56, "grad_norm": 4.814656676899176e-06, "learning_rate": 2.414715719063545e-06, "loss": 0.0005, "step": 155600 }, { "epoch": 15.57, "grad_norm": 2.2645757780992426e-05, "learning_rate": 2.4130434782608696e-06, "loss": 0.0, "step": 155700 }, { "epoch": 15.58, "grad_norm": 6.3812894950388e-06, "learning_rate": 2.411371237458194e-06, "loss": 0.0018, "step": 155800 }, { "epoch": 15.59, "grad_norm": 9.400127964909188e-06, "learning_rate": 2.4096989966555185e-06, "loss": 0.0075, "step": 155900 }, { "epoch": 15.6, "grad_norm": 0.000328985188389197, "learning_rate": 2.408026755852843e-06, "loss": 0.0105, "step": 156000 }, { "epoch": 15.61, "grad_norm": 0.0001703462185105309, "learning_rate": 2.4063545150501674e-06, "loss": 0.0, "step": 156100 }, { "epoch": 15.62, "grad_norm": 7.5086682045366615e-06, "learning_rate": 2.404682274247492e-06, "loss": 0.0, "step": 156200 }, { "epoch": 15.63, "grad_norm": 4.592197001329623e-06, "learning_rate": 2.403010033444816e-06, "loss": 0.0, "step": 156300 }, { "epoch": 15.64, "grad_norm": 0.0002146685728803277, "learning_rate": 2.401337792642141e-06, "loss": 0.0, "step": 156400 }, { "epoch": 15.65, "grad_norm": 1.4054951861908194e-05, "learning_rate": 2.3996655518394653e-06, "loss": 0.0018, "step": 156500 }, { "epoch": 15.66, "grad_norm": 8.72565942700021e-06, "learning_rate": 2.3979933110367893e-06, "loss": 0.0, "step": 156600 }, { "epoch": 15.67, "grad_norm": 7.5215334618405905e-06, "learning_rate": 2.396321070234114e-06, "loss": 0.0, "step": 156700 }, { "epoch": 15.68, "grad_norm": 1.3494937775249127e-05, "learning_rate": 2.3946488294314382e-06, "loss": 0.0043, "step": 156800 }, { "epoch": 15.69, "grad_norm": 0.00142685417085886, "learning_rate": 2.3929765886287627e-06, "loss": 0.0, "step": 156900 }, { "epoch": 15.7, "grad_norm": 1.0469170774740633e-05, "learning_rate": 2.391304347826087e-06, "loss": 0.0025, "step": 157000 }, { "epoch": 15.71, "grad_norm": 6.9538637035293505e-06, "learning_rate": 2.3896321070234116e-06, "loss": 0.0, "step": 157100 }, { "epoch": 15.72, "grad_norm": 0.00019743008306249976, "learning_rate": 2.387959866220736e-06, "loss": 0.0, "step": 157200 }, { "epoch": 15.73, "grad_norm": 5.979543402645504e-06, "learning_rate": 2.38628762541806e-06, "loss": 0.0, "step": 157300 }, { "epoch": 15.74, "grad_norm": 9.36076685320586e-06, "learning_rate": 2.384615384615385e-06, "loss": 0.0, "step": 157400 }, { "epoch": 15.75, "grad_norm": 4.156654085818445e-06, "learning_rate": 2.3829431438127095e-06, "loss": 0.0, "step": 157500 }, { "epoch": 15.76, "grad_norm": 0.0005622885655611753, "learning_rate": 2.3812709030100335e-06, "loss": 0.0045, "step": 157600 }, { "epoch": 15.77, "grad_norm": 1.6783771570771933e-05, "learning_rate": 2.379598662207358e-06, "loss": 0.0, "step": 157700 }, { "epoch": 15.78, "grad_norm": 2.4302729798364453e-05, "learning_rate": 2.3779264214046824e-06, "loss": 0.0, "step": 157800 }, { "epoch": 15.79, "grad_norm": 4.918417289445642e-06, "learning_rate": 2.376254180602007e-06, "loss": 0.0065, "step": 157900 }, { "epoch": 15.8, "grad_norm": 0.00043107534293085337, "learning_rate": 2.3745819397993314e-06, "loss": 0.002, "step": 158000 }, { "epoch": 15.81, "grad_norm": 6.5696453930286225e-06, "learning_rate": 2.372909698996656e-06, "loss": 0.0036, "step": 158100 }, { "epoch": 15.82, "grad_norm": 6.5576014094403945e-06, "learning_rate": 2.3712374581939803e-06, "loss": 0.0, "step": 158200 }, { "epoch": 15.83, "grad_norm": 6.10633696851437e-06, "learning_rate": 2.3695652173913043e-06, "loss": 0.0, "step": 158300 }, { "epoch": 15.84, "grad_norm": 2.4489367206115276e-05, "learning_rate": 2.3678929765886288e-06, "loss": 0.0, "step": 158400 }, { "epoch": 15.85, "grad_norm": 9.810268238652498e-06, "learning_rate": 2.3662207357859537e-06, "loss": 0.0, "step": 158500 }, { "epoch": 15.86, "grad_norm": 5.690500529453857e-06, "learning_rate": 2.3645484949832777e-06, "loss": 0.0, "step": 158600 }, { "epoch": 15.87, "grad_norm": 5.2947007134207524e-06, "learning_rate": 2.362876254180602e-06, "loss": 0.0, "step": 158700 }, { "epoch": 15.88, "grad_norm": 1.6920499547268264e-05, "learning_rate": 2.3612040133779266e-06, "loss": 0.0, "step": 158800 }, { "epoch": 15.89, "grad_norm": 6.31178954790812e-06, "learning_rate": 2.359531772575251e-06, "loss": 0.0, "step": 158900 }, { "epoch": 15.9, "grad_norm": 1.0661011401680298e-05, "learning_rate": 2.3578595317725755e-06, "loss": 0.0, "step": 159000 }, { "epoch": 15.91, "grad_norm": 2.5542381990817375e-05, "learning_rate": 2.3561872909699e-06, "loss": 0.0115, "step": 159100 }, { "epoch": 15.92, "grad_norm": 6.1712162278126925e-06, "learning_rate": 2.3545150501672245e-06, "loss": 0.0, "step": 159200 }, { "epoch": 15.93, "grad_norm": 0.0012592824641615152, "learning_rate": 2.3528428093645485e-06, "loss": 0.0, "step": 159300 }, { "epoch": 15.94, "grad_norm": 1.3988282262289431e-05, "learning_rate": 2.351170568561873e-06, "loss": 0.0, "step": 159400 }, { "epoch": 15.95, "grad_norm": 0.0015318294754251838, "learning_rate": 2.3494983277591974e-06, "loss": 0.0, "step": 159500 }, { "epoch": 15.96, "grad_norm": 1.7320944607490674e-05, "learning_rate": 2.347826086956522e-06, "loss": 0.0, "step": 159600 }, { "epoch": 15.97, "grad_norm": 6.387051598721882e-06, "learning_rate": 2.3461538461538463e-06, "loss": 0.0, "step": 159700 }, { "epoch": 15.98, "grad_norm": 0.0009833506774157286, "learning_rate": 2.344481605351171e-06, "loss": 0.0, "step": 159800 }, { "epoch": 15.99, "grad_norm": 0.005238015204668045, "learning_rate": 2.3428093645484953e-06, "loss": 0.0036, "step": 159900 }, { "epoch": 16.0, "grad_norm": 8.068822353379801e-06, "learning_rate": 2.3411371237458197e-06, "loss": 0.0018, "step": 160000 }, { "epoch": 16.0, "eval_accuracy": 0.986525, "eval_f1": 0.986525, "eval_loss": 0.1413874477148056, "eval_runtime": 134.4403, "eval_samples_per_second": 297.53, "eval_steps_per_second": 297.53, "step": 160000 }, { "epoch": 16.01, "grad_norm": 0.000655919371638447, "learning_rate": 2.339464882943144e-06, "loss": 0.0037, "step": 160100 }, { "epoch": 16.02, "grad_norm": 5.729572876589373e-06, "learning_rate": 2.337792642140468e-06, "loss": 0.0, "step": 160200 }, { "epoch": 16.03, "grad_norm": 3.707243740791455e-05, "learning_rate": 2.3361204013377927e-06, "loss": 0.0018, "step": 160300 }, { "epoch": 16.04, "grad_norm": 8.37804691400379e-05, "learning_rate": 2.334448160535117e-06, "loss": 0.0056, "step": 160400 }, { "epoch": 16.05, "grad_norm": 0.00018116727005690336, "learning_rate": 2.3327759197324416e-06, "loss": 0.0, "step": 160500 }, { "epoch": 16.06, "grad_norm": 5.471560143632814e-06, "learning_rate": 2.331103678929766e-06, "loss": 0.0, "step": 160600 }, { "epoch": 16.07, "grad_norm": 0.002342527499422431, "learning_rate": 2.3294314381270905e-06, "loss": 0.0, "step": 160700 }, { "epoch": 16.08, "grad_norm": 5.308319941832451e-06, "learning_rate": 2.327759197324415e-06, "loss": 0.0021, "step": 160800 }, { "epoch": 16.09, "grad_norm": 1.8882026779465377e-05, "learning_rate": 2.326086956521739e-06, "loss": 0.0, "step": 160900 }, { "epoch": 16.1, "grad_norm": 5.862941179657355e-06, "learning_rate": 2.324414715719064e-06, "loss": 0.0026, "step": 161000 }, { "epoch": 16.11, "grad_norm": 9.476354534854181e-06, "learning_rate": 2.3227424749163884e-06, "loss": 0.0, "step": 161100 }, { "epoch": 16.12, "grad_norm": 1.4225512131815776e-05, "learning_rate": 2.3210702341137124e-06, "loss": 0.0018, "step": 161200 }, { "epoch": 16.13, "grad_norm": 3.2810748962219805e-05, "learning_rate": 2.319397993311037e-06, "loss": 0.0024, "step": 161300 }, { "epoch": 16.14, "grad_norm": 0.006205264013260603, "learning_rate": 2.3177257525083613e-06, "loss": 0.0048, "step": 161400 }, { "epoch": 16.15, "grad_norm": 2.7496673283167183e-05, "learning_rate": 2.3160535117056858e-06, "loss": 0.0001, "step": 161500 }, { "epoch": 16.16, "grad_norm": 2.4235663659055717e-05, "learning_rate": 2.3143812709030102e-06, "loss": 0.003, "step": 161600 }, { "epoch": 16.17, "grad_norm": 4.839419489144348e-06, "learning_rate": 2.3127090301003347e-06, "loss": 0.0, "step": 161700 }, { "epoch": 16.18, "grad_norm": 6.52590169920586e-05, "learning_rate": 2.311036789297659e-06, "loss": 0.0019, "step": 161800 }, { "epoch": 16.19, "grad_norm": 5.044812041887781e-06, "learning_rate": 2.309364548494983e-06, "loss": 0.0, "step": 161900 }, { "epoch": 16.2, "grad_norm": 7.806534085830208e-06, "learning_rate": 2.307692307692308e-06, "loss": 0.0, "step": 162000 }, { "epoch": 16.21, "grad_norm": 0.0001349828380625695, "learning_rate": 2.3060200668896325e-06, "loss": 0.0, "step": 162100 }, { "epoch": 16.22, "grad_norm": 5.507646164915059e-06, "learning_rate": 2.3043478260869566e-06, "loss": 0.0, "step": 162200 }, { "epoch": 16.23, "grad_norm": 5.797621724923374e-06, "learning_rate": 2.302675585284281e-06, "loss": 0.0, "step": 162300 }, { "epoch": 16.24, "grad_norm": 7.901490607764572e-05, "learning_rate": 2.3010033444816055e-06, "loss": 0.0, "step": 162400 }, { "epoch": 16.25, "grad_norm": 5.0614430620044e-06, "learning_rate": 2.29933110367893e-06, "loss": 0.0, "step": 162500 }, { "epoch": 16.26, "grad_norm": 9.34005220187828e-06, "learning_rate": 2.2976588628762544e-06, "loss": 0.0, "step": 162600 }, { "epoch": 16.27, "grad_norm": 6.163528450997546e-05, "learning_rate": 2.295986622073579e-06, "loss": 0.0, "step": 162700 }, { "epoch": 16.28, "grad_norm": 4.1326575228595175e-06, "learning_rate": 2.2943143812709033e-06, "loss": 0.0003, "step": 162800 }, { "epoch": 16.29, "grad_norm": 4.5336778384807985e-06, "learning_rate": 2.2926421404682274e-06, "loss": 0.0, "step": 162900 }, { "epoch": 16.3, "grad_norm": 5.2979930842411704e-06, "learning_rate": 2.290969899665552e-06, "loss": 0.003, "step": 163000 }, { "epoch": 16.31, "grad_norm": 0.0012195265153422952, "learning_rate": 2.2892976588628767e-06, "loss": 0.0, "step": 163100 }, { "epoch": 16.32, "grad_norm": 0.003278920892626047, "learning_rate": 2.2876254180602008e-06, "loss": 0.0, "step": 163200 }, { "epoch": 16.33, "grad_norm": 6.400030542863533e-05, "learning_rate": 2.2859531772575252e-06, "loss": 0.0035, "step": 163300 }, { "epoch": 16.34, "grad_norm": 1.2621836503967643e-05, "learning_rate": 2.2842809364548497e-06, "loss": 0.0011, "step": 163400 }, { "epoch": 16.35, "grad_norm": 0.0002729544066824019, "learning_rate": 2.282608695652174e-06, "loss": 0.0, "step": 163500 }, { "epoch": 16.36, "grad_norm": 0.00021667926921509206, "learning_rate": 2.2809364548494986e-06, "loss": 0.0, "step": 163600 }, { "epoch": 16.37, "grad_norm": 4.100193837075494e-05, "learning_rate": 2.279264214046823e-06, "loss": 0.0, "step": 163700 }, { "epoch": 16.38, "grad_norm": 4.1616758608142845e-06, "learning_rate": 2.2775919732441475e-06, "loss": 0.0, "step": 163800 }, { "epoch": 16.39, "grad_norm": 443.37982177734375, "learning_rate": 2.2759197324414716e-06, "loss": 0.0012, "step": 163900 }, { "epoch": 16.4, "grad_norm": 5.086914370622253e-06, "learning_rate": 2.274247491638796e-06, "loss": 0.0035, "step": 164000 }, { "epoch": 16.41, "grad_norm": 6.053800007066457e-06, "learning_rate": 2.2725752508361205e-06, "loss": 0.0, "step": 164100 }, { "epoch": 16.42, "grad_norm": 3.32295780935965e-06, "learning_rate": 2.270903010033445e-06, "loss": 0.0071, "step": 164200 }, { "epoch": 16.43, "grad_norm": 1.0716218639572617e-05, "learning_rate": 2.2692307692307694e-06, "loss": 0.0057, "step": 164300 }, { "epoch": 16.44, "grad_norm": 4.743649242300307e-06, "learning_rate": 2.267558528428094e-06, "loss": 0.0, "step": 164400 }, { "epoch": 16.45, "grad_norm": 0.00011757192260120064, "learning_rate": 2.2658862876254183e-06, "loss": 0.0065, "step": 164500 }, { "epoch": 16.46, "grad_norm": 5.40465880476404e-05, "learning_rate": 2.2642140468227428e-06, "loss": 0.0, "step": 164600 }, { "epoch": 16.47, "grad_norm": 6.664827378699556e-05, "learning_rate": 2.2625418060200672e-06, "loss": 0.0006, "step": 164700 }, { "epoch": 16.48, "grad_norm": 4.517704383033561e-06, "learning_rate": 2.2608695652173913e-06, "loss": 0.0025, "step": 164800 }, { "epoch": 16.49, "grad_norm": 3.920742983609671e-06, "learning_rate": 2.2591973244147157e-06, "loss": 0.0, "step": 164900 }, { "epoch": 16.5, "grad_norm": 7.613572961417958e-06, "learning_rate": 2.25752508361204e-06, "loss": 0.0, "step": 165000 }, { "epoch": 16.5, "eval_accuracy": 0.98765, "eval_f1": 0.98765, "eval_loss": 0.1337709277868271, "eval_runtime": 134.8045, "eval_samples_per_second": 296.726, "eval_steps_per_second": 296.726, "step": 165000 }, { "epoch": 16.51, "grad_norm": 1.6788189896033145e-05, "learning_rate": 2.2558528428093647e-06, "loss": 0.0, "step": 165100 }, { "epoch": 16.52, "grad_norm": 5.329384748620214e-06, "learning_rate": 2.254180602006689e-06, "loss": 0.0, "step": 165200 }, { "epoch": 16.53, "grad_norm": 6.522398052766221e-06, "learning_rate": 2.2525083612040136e-06, "loss": 0.0, "step": 165300 }, { "epoch": 16.54, "grad_norm": 8.143794730131049e-06, "learning_rate": 2.250836120401338e-06, "loss": 0.0, "step": 165400 }, { "epoch": 16.55, "grad_norm": 4.95312488055788e-05, "learning_rate": 2.249163879598662e-06, "loss": 0.0001, "step": 165500 }, { "epoch": 16.56, "grad_norm": 5.945916655036854e-06, "learning_rate": 2.247491638795987e-06, "loss": 0.0037, "step": 165600 }, { "epoch": 16.57, "grad_norm": 0.0021674486342817545, "learning_rate": 2.2458193979933114e-06, "loss": 0.0, "step": 165700 }, { "epoch": 16.58, "grad_norm": 3.8080481317592785e-05, "learning_rate": 2.2441471571906355e-06, "loss": 0.0071, "step": 165800 }, { "epoch": 16.59, "grad_norm": 2.418838812445756e-05, "learning_rate": 2.24247491638796e-06, "loss": 0.0029, "step": 165900 }, { "epoch": 16.6, "grad_norm": 4.79162554256618e-06, "learning_rate": 2.2408026755852844e-06, "loss": 0.0001, "step": 166000 }, { "epoch": 16.61, "grad_norm": 5.906681963097071e-06, "learning_rate": 2.239130434782609e-06, "loss": 0.0082, "step": 166100 }, { "epoch": 16.62, "grad_norm": 7.5729653872258496e-06, "learning_rate": 2.2374581939799333e-06, "loss": 0.0027, "step": 166200 }, { "epoch": 16.63, "grad_norm": 0.00019007065566256642, "learning_rate": 2.2357859531772578e-06, "loss": 0.0, "step": 166300 }, { "epoch": 16.64, "grad_norm": 3.341139017720707e-05, "learning_rate": 2.2341137123745822e-06, "loss": 0.0012, "step": 166400 }, { "epoch": 16.65, "grad_norm": 5.296933977660956e-06, "learning_rate": 2.2324414715719063e-06, "loss": 0.0019, "step": 166500 }, { "epoch": 16.66, "grad_norm": 0.0005588891799561679, "learning_rate": 2.230769230769231e-06, "loss": 0.0035, "step": 166600 }, { "epoch": 16.67, "grad_norm": 0.0007428290555253625, "learning_rate": 2.2290969899665556e-06, "loss": 0.0014, "step": 166700 }, { "epoch": 16.68, "grad_norm": 1.5256994629453402e-05, "learning_rate": 2.2274247491638796e-06, "loss": 0.0, "step": 166800 }, { "epoch": 16.69, "grad_norm": 3.1883606425253674e-05, "learning_rate": 2.225752508361204e-06, "loss": 0.0, "step": 166900 }, { "epoch": 16.7, "grad_norm": 7.867512067605276e-06, "learning_rate": 2.2240802675585286e-06, "loss": 0.0, "step": 167000 }, { "epoch": 16.71, "grad_norm": 6.410631158360047e-06, "learning_rate": 2.222408026755853e-06, "loss": 0.0, "step": 167100 }, { "epoch": 16.72, "grad_norm": 9.276688615500461e-06, "learning_rate": 2.2207357859531775e-06, "loss": 0.0, "step": 167200 }, { "epoch": 16.73, "grad_norm": 6.474059318861691e-06, "learning_rate": 2.219063545150502e-06, "loss": 0.0, "step": 167300 }, { "epoch": 16.74, "grad_norm": 6.038425908627687e-06, "learning_rate": 2.2173913043478264e-06, "loss": 0.0042, "step": 167400 }, { "epoch": 16.75, "grad_norm": 4.583041118166875e-06, "learning_rate": 2.2157190635451504e-06, "loss": 0.0, "step": 167500 }, { "epoch": 16.76, "grad_norm": 5.181601954973303e-05, "learning_rate": 2.2140468227424753e-06, "loss": 0.0, "step": 167600 }, { "epoch": 16.77, "grad_norm": 3.553269925760105e-05, "learning_rate": 2.2123745819397998e-06, "loss": 0.0, "step": 167700 }, { "epoch": 16.78, "grad_norm": 0.00026847337721847, "learning_rate": 2.210702341137124e-06, "loss": 0.0, "step": 167800 }, { "epoch": 16.79, "grad_norm": 0.00020925169519614428, "learning_rate": 2.2090301003344483e-06, "loss": 0.0, "step": 167900 }, { "epoch": 16.8, "grad_norm": 0.00029760925099253654, "learning_rate": 2.2073578595317727e-06, "loss": 0.0, "step": 168000 }, { "epoch": 16.81, "grad_norm": 8.02914200903615e-06, "learning_rate": 2.205685618729097e-06, "loss": 0.0, "step": 168100 }, { "epoch": 16.82, "grad_norm": 4.700968929682858e-06, "learning_rate": 2.2040133779264217e-06, "loss": 0.0, "step": 168200 }, { "epoch": 16.83, "grad_norm": 67.33990478515625, "learning_rate": 2.202341137123746e-06, "loss": 0.0033, "step": 168300 }, { "epoch": 16.84, "grad_norm": 0.00011456763604655862, "learning_rate": 2.2006688963210706e-06, "loss": 0.0, "step": 168400 }, { "epoch": 16.85, "grad_norm": 0.0007877860916778445, "learning_rate": 2.1989966555183946e-06, "loss": 0.0, "step": 168500 }, { "epoch": 16.86, "grad_norm": 2.5768347313714912e-06, "learning_rate": 2.197324414715719e-06, "loss": 0.0, "step": 168600 }, { "epoch": 16.87, "grad_norm": 3.3434796932851896e-05, "learning_rate": 2.1956521739130435e-06, "loss": 0.0, "step": 168700 }, { "epoch": 16.88, "grad_norm": 4.778345555678243e-06, "learning_rate": 2.193979933110368e-06, "loss": 0.0, "step": 168800 }, { "epoch": 16.89, "grad_norm": 4.4025196075381245e-06, "learning_rate": 2.1923076923076925e-06, "loss": 0.0, "step": 168900 }, { "epoch": 16.9, "grad_norm": 3.4835945825761883e-06, "learning_rate": 2.190635451505017e-06, "loss": 0.0, "step": 169000 }, { "epoch": 16.91, "grad_norm": 4.608343715517549e-06, "learning_rate": 2.1889632107023414e-06, "loss": 0.0, "step": 169100 }, { "epoch": 16.92, "grad_norm": 2.3206870537251234e-05, "learning_rate": 2.187290969899666e-06, "loss": 0.0, "step": 169200 }, { "epoch": 16.93, "grad_norm": 1.2776320545526687e-05, "learning_rate": 2.1856187290969903e-06, "loss": 0.0046, "step": 169300 }, { "epoch": 16.94, "grad_norm": 1.215246993524488e-05, "learning_rate": 2.1839464882943143e-06, "loss": 0.0, "step": 169400 }, { "epoch": 16.95, "grad_norm": 4.593052381096641e-06, "learning_rate": 2.182274247491639e-06, "loss": 0.0, "step": 169500 }, { "epoch": 16.96, "grad_norm": 3.894136898452416e-05, "learning_rate": 2.1806020066889633e-06, "loss": 0.0047, "step": 169600 }, { "epoch": 16.97, "grad_norm": 1.1413544598326553e-05, "learning_rate": 2.1789297658862877e-06, "loss": 0.0108, "step": 169700 }, { "epoch": 16.98, "grad_norm": 9.005904757941607e-06, "learning_rate": 2.177257525083612e-06, "loss": 0.0032, "step": 169800 }, { "epoch": 16.99, "grad_norm": 9.968599624698982e-05, "learning_rate": 2.1755852842809366e-06, "loss": 0.0086, "step": 169900 }, { "epoch": 17.0, "grad_norm": 0.00240413099527359, "learning_rate": 2.173913043478261e-06, "loss": 0.003, "step": 170000 }, { "epoch": 17.0, "eval_accuracy": 0.986175, "eval_f1": 0.986175, "eval_loss": 0.14242902398109436, "eval_runtime": 134.4229, "eval_samples_per_second": 297.568, "eval_steps_per_second": 297.568, "step": 170000 }, { "epoch": 17.01, "grad_norm": 1.1759045264625456e-05, "learning_rate": 2.172240802675585e-06, "loss": 0.0109, "step": 170100 }, { "epoch": 17.02, "grad_norm": 6.563326678588055e-06, "learning_rate": 2.17056856187291e-06, "loss": 0.0, "step": 170200 }, { "epoch": 17.03, "grad_norm": 0.054090823978185654, "learning_rate": 2.1688963210702345e-06, "loss": 0.0, "step": 170300 }, { "epoch": 17.04, "grad_norm": 9.796851372811943e-05, "learning_rate": 2.1672240802675585e-06, "loss": 0.0, "step": 170400 }, { "epoch": 17.05, "grad_norm": 4.6282475523184985e-06, "learning_rate": 2.165551839464883e-06, "loss": 0.0, "step": 170500 }, { "epoch": 17.06, "grad_norm": 4.962850198353408e-06, "learning_rate": 2.1638795986622074e-06, "loss": 0.0, "step": 170600 }, { "epoch": 17.07, "grad_norm": 0.00048357073683291674, "learning_rate": 2.162207357859532e-06, "loss": 0.0, "step": 170700 }, { "epoch": 17.08, "grad_norm": 3.0319617508212104e-05, "learning_rate": 2.1605351170568564e-06, "loss": 0.0, "step": 170800 }, { "epoch": 17.09, "grad_norm": 0.0008551038918085396, "learning_rate": 2.158862876254181e-06, "loss": 0.0, "step": 170900 }, { "epoch": 17.1, "grad_norm": 6.037145340087591e-06, "learning_rate": 2.1571906354515053e-06, "loss": 0.0, "step": 171000 }, { "epoch": 17.11, "grad_norm": 6.677675173705211e-06, "learning_rate": 2.1555183946488293e-06, "loss": 0.0001, "step": 171100 }, { "epoch": 17.12, "grad_norm": 5.688702731276862e-05, "learning_rate": 2.153846153846154e-06, "loss": 0.0, "step": 171200 }, { "epoch": 17.13, "grad_norm": 5.400210739026079e-06, "learning_rate": 2.1521739130434787e-06, "loss": 0.003, "step": 171300 }, { "epoch": 17.14, "grad_norm": 7.61369210522389e-06, "learning_rate": 2.1505016722408027e-06, "loss": 0.0001, "step": 171400 }, { "epoch": 17.15, "grad_norm": 4.224335953040281e-06, "learning_rate": 2.148829431438127e-06, "loss": 0.0, "step": 171500 }, { "epoch": 17.16, "grad_norm": 0.0001257136173080653, "learning_rate": 2.1471571906354516e-06, "loss": 0.0, "step": 171600 }, { "epoch": 17.17, "grad_norm": 2.4170622054953128e-05, "learning_rate": 2.145484949832776e-06, "loss": 0.0, "step": 171700 }, { "epoch": 17.18, "grad_norm": 4.489980256039416e-06, "learning_rate": 2.1438127090301005e-06, "loss": 0.0, "step": 171800 }, { "epoch": 17.19, "grad_norm": 3.3881981380545767e-06, "learning_rate": 2.142140468227425e-06, "loss": 0.0, "step": 171900 }, { "epoch": 17.2, "grad_norm": 1.6857806258485653e-05, "learning_rate": 2.1404682274247495e-06, "loss": 0.0, "step": 172000 }, { "epoch": 17.21, "grad_norm": 3.3245811209781095e-05, "learning_rate": 2.1387959866220735e-06, "loss": 0.0, "step": 172100 }, { "epoch": 17.22, "grad_norm": 9.031922672875226e-06, "learning_rate": 2.1371237458193984e-06, "loss": 0.0, "step": 172200 }, { "epoch": 17.23, "grad_norm": 2.7925609629164683e-06, "learning_rate": 2.135451505016723e-06, "loss": 0.0, "step": 172300 }, { "epoch": 17.24, "grad_norm": 6.467849289037986e-06, "learning_rate": 2.133779264214047e-06, "loss": 0.0, "step": 172400 }, { "epoch": 17.25, "grad_norm": 1.820028046495281e-05, "learning_rate": 2.1321070234113713e-06, "loss": 0.0, "step": 172500 }, { "epoch": 17.26, "grad_norm": 5.631500698655145e-06, "learning_rate": 2.130434782608696e-06, "loss": 0.0003, "step": 172600 }, { "epoch": 17.27, "grad_norm": 0.00015545215865131468, "learning_rate": 2.1287625418060203e-06, "loss": 0.0, "step": 172700 }, { "epoch": 17.28, "grad_norm": 0.00011321782949380577, "learning_rate": 2.1270903010033447e-06, "loss": 0.0, "step": 172800 }, { "epoch": 17.29, "grad_norm": 4.420689947437495e-06, "learning_rate": 2.125418060200669e-06, "loss": 0.0, "step": 172900 }, { "epoch": 17.3, "grad_norm": 4.325940608396195e-06, "learning_rate": 2.1237458193979936e-06, "loss": 0.0, "step": 173000 }, { "epoch": 17.31, "grad_norm": 2.6896172130363993e-05, "learning_rate": 2.1220735785953177e-06, "loss": 0.0, "step": 173100 }, { "epoch": 17.32, "grad_norm": 2.51541132456623e-06, "learning_rate": 2.120401337792642e-06, "loss": 0.0, "step": 173200 }, { "epoch": 17.33, "grad_norm": 0.0002507483586668968, "learning_rate": 2.118729096989967e-06, "loss": 0.0, "step": 173300 }, { "epoch": 17.34, "grad_norm": 1.9010116147910594e-06, "learning_rate": 2.117056856187291e-06, "loss": 0.0, "step": 173400 }, { "epoch": 17.35, "grad_norm": 4.968646408087807e-06, "learning_rate": 2.1153846153846155e-06, "loss": 0.0, "step": 173500 }, { "epoch": 17.36, "grad_norm": 2.4034538910200354e-06, "learning_rate": 2.11371237458194e-06, "loss": 0.0, "step": 173600 }, { "epoch": 17.37, "grad_norm": 3.1217680316331098e-06, "learning_rate": 2.1120401337792644e-06, "loss": 0.0, "step": 173700 }, { "epoch": 17.38, "grad_norm": 4.924338099954184e-06, "learning_rate": 2.110367892976589e-06, "loss": 0.0, "step": 173800 }, { "epoch": 17.39, "grad_norm": 3.1763579499966e-06, "learning_rate": 2.1086956521739134e-06, "loss": 0.015, "step": 173900 }, { "epoch": 17.4, "grad_norm": 8.308285032398999e-06, "learning_rate": 2.1070234113712374e-06, "loss": 0.0, "step": 174000 }, { "epoch": 17.41, "grad_norm": 4.124725364818005e-06, "learning_rate": 2.105351170568562e-06, "loss": 0.0039, "step": 174100 }, { "epoch": 17.42, "grad_norm": 1.6264404621324502e-05, "learning_rate": 2.1036789297658863e-06, "loss": 0.0, "step": 174200 }, { "epoch": 17.43, "grad_norm": 1.3464480616676155e-05, "learning_rate": 2.1020066889632108e-06, "loss": 0.0, "step": 174300 }, { "epoch": 17.44, "grad_norm": 3.041270247194916e-06, "learning_rate": 2.1003344481605352e-06, "loss": 0.0, "step": 174400 }, { "epoch": 17.45, "grad_norm": 2.5817935238592327e-06, "learning_rate": 2.0986622073578597e-06, "loss": 0.0, "step": 174500 }, { "epoch": 17.46, "grad_norm": 5.324686935637146e-05, "learning_rate": 2.096989966555184e-06, "loss": 0.0, "step": 174600 }, { "epoch": 17.47, "grad_norm": 8.143258128257003e-06, "learning_rate": 2.095317725752508e-06, "loss": 0.0, "step": 174700 }, { "epoch": 17.48, "grad_norm": 0.00127193215303123, "learning_rate": 2.093645484949833e-06, "loss": 0.0017, "step": 174800 }, { "epoch": 17.49, "grad_norm": 0.003099586581811309, "learning_rate": 2.0919732441471575e-06, "loss": 0.0, "step": 174900 }, { "epoch": 17.5, "grad_norm": 5.266407697490649e-06, "learning_rate": 2.0903010033444816e-06, "loss": 0.0, "step": 175000 }, { "epoch": 17.5, "eval_accuracy": 0.9876, "eval_f1": 0.9876, "eval_loss": 0.14104844629764557, "eval_runtime": 135.6218, "eval_samples_per_second": 294.938, "eval_steps_per_second": 294.938, "step": 175000 }, { "epoch": 17.51, "grad_norm": 5.470873020385625e-06, "learning_rate": 2.088628762541806e-06, "loss": 0.0027, "step": 175100 }, { "epoch": 17.52, "grad_norm": 3.602494871302042e-06, "learning_rate": 2.0869565217391305e-06, "loss": 0.0, "step": 175200 }, { "epoch": 17.53, "grad_norm": 3.7983577385602985e-06, "learning_rate": 2.085284280936455e-06, "loss": 0.0049, "step": 175300 }, { "epoch": 17.54, "grad_norm": 2.3961443730513565e-05, "learning_rate": 2.0836120401337794e-06, "loss": 0.0, "step": 175400 }, { "epoch": 17.55, "grad_norm": 0.0007328629144467413, "learning_rate": 2.081939799331104e-06, "loss": 0.0043, "step": 175500 }, { "epoch": 17.56, "grad_norm": 1.5070129848027136e-05, "learning_rate": 2.0802675585284283e-06, "loss": 0.0028, "step": 175600 }, { "epoch": 17.57, "grad_norm": 7.220142288133502e-05, "learning_rate": 2.0785953177257524e-06, "loss": 0.004, "step": 175700 }, { "epoch": 17.58, "grad_norm": 3.834291419479996e-06, "learning_rate": 2.0769230769230773e-06, "loss": 0.0, "step": 175800 }, { "epoch": 17.59, "grad_norm": 2.9713196454395074e-06, "learning_rate": 2.0752508361204017e-06, "loss": 0.0, "step": 175900 }, { "epoch": 17.6, "grad_norm": 3.5655093597597443e-06, "learning_rate": 2.0735785953177258e-06, "loss": 0.0048, "step": 176000 }, { "epoch": 17.61, "grad_norm": 2.6606401661410928e-05, "learning_rate": 2.0719063545150502e-06, "loss": 0.0051, "step": 176100 }, { "epoch": 17.62, "grad_norm": 8.27614203444682e-05, "learning_rate": 2.0702341137123747e-06, "loss": 0.0, "step": 176200 }, { "epoch": 17.63, "grad_norm": 0.0005782655789516866, "learning_rate": 2.068561872909699e-06, "loss": 0.0056, "step": 176300 }, { "epoch": 17.64, "grad_norm": 4.853478912991704e-06, "learning_rate": 2.0668896321070236e-06, "loss": 0.0, "step": 176400 }, { "epoch": 17.65, "grad_norm": 1.0566937817202415e-05, "learning_rate": 2.065217391304348e-06, "loss": 0.0, "step": 176500 }, { "epoch": 17.66, "grad_norm": 1.4728007045050617e-05, "learning_rate": 2.0635451505016725e-06, "loss": 0.0, "step": 176600 }, { "epoch": 17.67, "grad_norm": 2.444941310386639e-05, "learning_rate": 2.0618729096989966e-06, "loss": 0.004, "step": 176700 }, { "epoch": 17.68, "grad_norm": 0.0010614750208333135, "learning_rate": 2.0602006688963215e-06, "loss": 0.0, "step": 176800 }, { "epoch": 17.69, "grad_norm": 4.313383669796167e-06, "learning_rate": 2.058528428093646e-06, "loss": 0.0, "step": 176900 }, { "epoch": 17.7, "grad_norm": 3.7007937407906866e-06, "learning_rate": 2.05685618729097e-06, "loss": 0.0, "step": 177000 }, { "epoch": 17.71, "grad_norm": 7.082697720761644e-06, "learning_rate": 2.0551839464882944e-06, "loss": 0.0029, "step": 177100 }, { "epoch": 17.72, "grad_norm": 4.934473054163391e-06, "learning_rate": 2.053511705685619e-06, "loss": 0.0003, "step": 177200 }, { "epoch": 17.73, "grad_norm": 1.3510943063010927e-05, "learning_rate": 2.0518394648829433e-06, "loss": 0.0, "step": 177300 }, { "epoch": 17.74, "grad_norm": 4.9949585445574485e-06, "learning_rate": 2.050167224080268e-06, "loss": 0.0, "step": 177400 }, { "epoch": 17.75, "grad_norm": 3.1264180506695993e-06, "learning_rate": 2.0484949832775922e-06, "loss": 0.0, "step": 177500 }, { "epoch": 17.76, "grad_norm": 0.0013070253189653158, "learning_rate": 2.0468227424749167e-06, "loss": 0.0047, "step": 177600 }, { "epoch": 17.77, "grad_norm": 1.3158923138689715e-05, "learning_rate": 2.0451505016722407e-06, "loss": 0.0001, "step": 177700 }, { "epoch": 17.78, "grad_norm": 7.450151315424591e-06, "learning_rate": 2.0434782608695656e-06, "loss": 0.0, "step": 177800 }, { "epoch": 17.79, "grad_norm": 0.00015990401152521372, "learning_rate": 2.04180602006689e-06, "loss": 0.0, "step": 177900 }, { "epoch": 17.8, "grad_norm": 4.039090981677873e-06, "learning_rate": 2.040133779264214e-06, "loss": 0.0052, "step": 178000 }, { "epoch": 17.81, "grad_norm": 1.5695175534347072e-05, "learning_rate": 2.0384615384615386e-06, "loss": 0.0, "step": 178100 }, { "epoch": 17.82, "grad_norm": 0.0006352474447339773, "learning_rate": 2.036789297658863e-06, "loss": 0.0065, "step": 178200 }, { "epoch": 17.83, "grad_norm": 4.0101067497744225e-06, "learning_rate": 2.0351170568561875e-06, "loss": 0.0052, "step": 178300 }, { "epoch": 17.84, "grad_norm": 0.023364724591374397, "learning_rate": 2.033444816053512e-06, "loss": 0.0057, "step": 178400 }, { "epoch": 17.85, "grad_norm": 5.3487925470108166e-05, "learning_rate": 2.0317725752508364e-06, "loss": 0.0, "step": 178500 }, { "epoch": 17.86, "grad_norm": 1.5035124306450598e-05, "learning_rate": 2.030100334448161e-06, "loss": 0.0, "step": 178600 }, { "epoch": 17.87, "grad_norm": 2.6687663194024935e-05, "learning_rate": 2.028428093645485e-06, "loss": 0.0, "step": 178700 }, { "epoch": 17.88, "grad_norm": 1.9049177353736013e-05, "learning_rate": 2.0267558528428094e-06, "loss": 0.0002, "step": 178800 }, { "epoch": 17.89, "grad_norm": 0.0001085298354155384, "learning_rate": 2.025083612040134e-06, "loss": 0.0, "step": 178900 }, { "epoch": 17.9, "grad_norm": 2.8620918328670086e-06, "learning_rate": 2.0234113712374583e-06, "loss": 0.0022, "step": 179000 }, { "epoch": 17.91, "grad_norm": 3.738312079804018e-05, "learning_rate": 2.0217391304347828e-06, "loss": 0.0, "step": 179100 }, { "epoch": 17.92, "grad_norm": 4.336022357165348e-06, "learning_rate": 2.0200668896321072e-06, "loss": 0.0, "step": 179200 }, { "epoch": 17.93, "grad_norm": 3.867320174322231e-06, "learning_rate": 2.0183946488294317e-06, "loss": 0.0, "step": 179300 }, { "epoch": 17.94, "grad_norm": 5.321133357938379e-06, "learning_rate": 2.016722408026756e-06, "loss": 0.0, "step": 179400 }, { "epoch": 17.95, "grad_norm": 0.00027794932248070836, "learning_rate": 2.0150501672240806e-06, "loss": 0.0062, "step": 179500 }, { "epoch": 17.96, "grad_norm": 4.577206254907651e-06, "learning_rate": 2.0133779264214046e-06, "loss": 0.0, "step": 179600 }, { "epoch": 17.97, "grad_norm": 4.166466897004284e-05, "learning_rate": 2.011705685618729e-06, "loss": 0.0, "step": 179700 }, { "epoch": 17.98, "grad_norm": 6.511841547762742e-06, "learning_rate": 2.0100334448160536e-06, "loss": 0.0, "step": 179800 }, { "epoch": 17.99, "grad_norm": 8.989376510726288e-06, "learning_rate": 2.008361204013378e-06, "loss": 0.0, "step": 179900 }, { "epoch": 18.0, "grad_norm": 0.00018079046276398003, "learning_rate": 2.0066889632107025e-06, "loss": 0.0018, "step": 180000 }, { "epoch": 18.0, "eval_accuracy": 0.9858, "eval_f1": 0.9858, "eval_loss": 0.1534043401479721, "eval_runtime": 137.7635, "eval_samples_per_second": 290.353, "eval_steps_per_second": 290.353, "step": 180000 }, { "epoch": 18.01, "grad_norm": 0.00037978720501996577, "learning_rate": 2.005016722408027e-06, "loss": 0.0019, "step": 180100 }, { "epoch": 18.02, "grad_norm": 3.5032473988394486e-06, "learning_rate": 2.0033444816053514e-06, "loss": 0.0, "step": 180200 }, { "epoch": 18.03, "grad_norm": 2.6313730359106557e-06, "learning_rate": 2.0016722408026754e-06, "loss": 0.0, "step": 180300 }, { "epoch": 18.04, "grad_norm": 1.732081000227481e-05, "learning_rate": 2.0000000000000003e-06, "loss": 0.0001, "step": 180400 }, { "epoch": 18.05, "grad_norm": 7.704897143412381e-06, "learning_rate": 1.998327759197325e-06, "loss": 0.0, "step": 180500 }, { "epoch": 18.06, "grad_norm": 2.443795665385551e-06, "learning_rate": 1.996655518394649e-06, "loss": 0.0, "step": 180600 }, { "epoch": 18.07, "grad_norm": 8.326176612172276e-05, "learning_rate": 1.9949832775919733e-06, "loss": 0.0013, "step": 180700 }, { "epoch": 18.08, "grad_norm": 2.8846391160186613e-06, "learning_rate": 1.9933110367892978e-06, "loss": 0.0043, "step": 180800 }, { "epoch": 18.09, "grad_norm": 3.090189056820236e-05, "learning_rate": 1.9916387959866222e-06, "loss": 0.0, "step": 180900 }, { "epoch": 18.1, "grad_norm": 0.0011116014793515205, "learning_rate": 1.9899665551839467e-06, "loss": 0.0, "step": 181000 }, { "epoch": 18.11, "grad_norm": 1.265348146262113e-05, "learning_rate": 1.988294314381271e-06, "loss": 0.0042, "step": 181100 }, { "epoch": 18.12, "grad_norm": 3.777616257139016e-06, "learning_rate": 1.9866220735785956e-06, "loss": 0.0024, "step": 181200 }, { "epoch": 18.13, "grad_norm": 3.274929622421041e-05, "learning_rate": 1.9849498327759196e-06, "loss": 0.0, "step": 181300 }, { "epoch": 18.14, "grad_norm": 2.9154903131711762e-06, "learning_rate": 1.9832775919732445e-06, "loss": 0.0, "step": 181400 }, { "epoch": 18.15, "grad_norm": 6.4856340031838045e-06, "learning_rate": 1.981605351170569e-06, "loss": 0.0052, "step": 181500 }, { "epoch": 18.16, "grad_norm": 5.131582838657778e-06, "learning_rate": 1.979933110367893e-06, "loss": 0.0143, "step": 181600 }, { "epoch": 18.17, "grad_norm": 3.5505281630321406e-06, "learning_rate": 1.9782608695652175e-06, "loss": 0.0022, "step": 181700 }, { "epoch": 18.18, "grad_norm": 4.868303221883252e-06, "learning_rate": 1.976588628762542e-06, "loss": 0.0107, "step": 181800 }, { "epoch": 18.19, "grad_norm": 3.616964022512548e-05, "learning_rate": 1.9749163879598664e-06, "loss": 0.0, "step": 181900 }, { "epoch": 18.2, "grad_norm": 3.501211540424265e-05, "learning_rate": 1.973244147157191e-06, "loss": 0.0, "step": 182000 }, { "epoch": 18.21, "grad_norm": 4.2304021917516366e-05, "learning_rate": 1.9715719063545153e-06, "loss": 0.0, "step": 182100 }, { "epoch": 18.22, "grad_norm": 5.119220077176578e-06, "learning_rate": 1.9698996655518398e-06, "loss": 0.0, "step": 182200 }, { "epoch": 18.23, "grad_norm": 3.877084054693114e-06, "learning_rate": 1.968227424749164e-06, "loss": 0.0, "step": 182300 }, { "epoch": 18.24, "grad_norm": 4.858573811361566e-05, "learning_rate": 1.9665551839464887e-06, "loss": 0.0, "step": 182400 }, { "epoch": 18.25, "grad_norm": 0.00018243178783450276, "learning_rate": 1.964882943143813e-06, "loss": 0.0037, "step": 182500 }, { "epoch": 18.26, "grad_norm": 8.987120963865891e-06, "learning_rate": 1.963210702341137e-06, "loss": 0.0, "step": 182600 }, { "epoch": 18.27, "grad_norm": 3.947538971260656e-06, "learning_rate": 1.9615384615384617e-06, "loss": 0.0076, "step": 182700 }, { "epoch": 18.28, "grad_norm": 0.016083484515547752, "learning_rate": 1.959866220735786e-06, "loss": 0.0, "step": 182800 }, { "epoch": 18.29, "grad_norm": 2.228962148365099e-05, "learning_rate": 1.9581939799331106e-06, "loss": 0.0, "step": 182900 }, { "epoch": 18.3, "grad_norm": 4.8957945182337426e-06, "learning_rate": 1.956521739130435e-06, "loss": 0.0, "step": 183000 }, { "epoch": 18.31, "grad_norm": 3.4853383112931624e-06, "learning_rate": 1.9548494983277595e-06, "loss": 0.005, "step": 183100 }, { "epoch": 18.32, "grad_norm": 0.008418547920882702, "learning_rate": 1.953177257525084e-06, "loss": 0.0033, "step": 183200 }, { "epoch": 18.33, "grad_norm": 6.046983799024019e-06, "learning_rate": 1.951505016722408e-06, "loss": 0.0034, "step": 183300 }, { "epoch": 18.34, "grad_norm": 0.000668312597554177, "learning_rate": 1.9498327759197325e-06, "loss": 0.0, "step": 183400 }, { "epoch": 18.35, "grad_norm": 8.495872862113174e-06, "learning_rate": 1.948160535117057e-06, "loss": 0.0019, "step": 183500 }, { "epoch": 18.36, "grad_norm": 0.0002771254803519696, "learning_rate": 1.9464882943143814e-06, "loss": 0.0, "step": 183600 }, { "epoch": 18.37, "grad_norm": 5.837029220856493e-06, "learning_rate": 1.944816053511706e-06, "loss": 0.0, "step": 183700 }, { "epoch": 18.38, "grad_norm": 0.00012922013411298394, "learning_rate": 1.9431438127090303e-06, "loss": 0.0, "step": 183800 }, { "epoch": 18.39, "grad_norm": 3.7937059005344054e-06, "learning_rate": 1.9414715719063548e-06, "loss": 0.0, "step": 183900 }, { "epoch": 18.4, "grad_norm": 4.895188794762362e-06, "learning_rate": 1.9397993311036792e-06, "loss": 0.0, "step": 184000 }, { "epoch": 18.41, "grad_norm": 1.4537821698468179e-05, "learning_rate": 1.9381270903010037e-06, "loss": 0.0, "step": 184100 }, { "epoch": 18.42, "grad_norm": 4.6436987759079784e-05, "learning_rate": 1.9364548494983277e-06, "loss": 0.0, "step": 184200 }, { "epoch": 18.43, "grad_norm": 1.5509576769545674e-05, "learning_rate": 1.934782608695652e-06, "loss": 0.0, "step": 184300 }, { "epoch": 18.44, "grad_norm": 3.2652471418259665e-05, "learning_rate": 1.9331103678929766e-06, "loss": 0.0, "step": 184400 }, { "epoch": 18.45, "grad_norm": 3.94304743167595e-06, "learning_rate": 1.931438127090301e-06, "loss": 0.0, "step": 184500 }, { "epoch": 18.46, "grad_norm": 4.365731911093462e-06, "learning_rate": 1.9297658862876256e-06, "loss": 0.0051, "step": 184600 }, { "epoch": 18.47, "grad_norm": 6.958713584026555e-06, "learning_rate": 1.92809364548495e-06, "loss": 0.0, "step": 184700 }, { "epoch": 18.48, "grad_norm": 1.526621781522408e-05, "learning_rate": 1.9264214046822745e-06, "loss": 0.0, "step": 184800 }, { "epoch": 18.49, "grad_norm": 6.0657366702798754e-05, "learning_rate": 1.9247491638795985e-06, "loss": 0.0, "step": 184900 }, { "epoch": 18.5, "grad_norm": 7.313900823646691e-06, "learning_rate": 1.9230769230769234e-06, "loss": 0.0, "step": 185000 }, { "epoch": 18.5, "eval_accuracy": 0.986975, "eval_f1": 0.986975, "eval_loss": 0.1347718983888626, "eval_runtime": 139.3214, "eval_samples_per_second": 287.106, "eval_steps_per_second": 287.106, "step": 185000 }, { "epoch": 18.51, "grad_norm": 8.065755537245423e-06, "learning_rate": 1.921404682274248e-06, "loss": 0.0, "step": 185100 }, { "epoch": 18.52, "grad_norm": 3.502688514345209e-06, "learning_rate": 1.919732441471572e-06, "loss": 0.0, "step": 185200 }, { "epoch": 18.53, "grad_norm": 3.190935558450292e-06, "learning_rate": 1.9180602006688964e-06, "loss": 0.0, "step": 185300 }, { "epoch": 18.54, "grad_norm": 1.8046868717647158e-05, "learning_rate": 1.916387959866221e-06, "loss": 0.0, "step": 185400 }, { "epoch": 18.55, "grad_norm": 4.194449957140023e-06, "learning_rate": 1.9147157190635453e-06, "loss": 0.0, "step": 185500 }, { "epoch": 18.56, "grad_norm": 2.685572098926059e-06, "learning_rate": 1.9130434782608697e-06, "loss": 0.0, "step": 185600 }, { "epoch": 18.57, "grad_norm": 7.828857633285224e-05, "learning_rate": 1.911371237458194e-06, "loss": 0.0, "step": 185700 }, { "epoch": 18.58, "grad_norm": 5.445906936074607e-06, "learning_rate": 1.9096989966555187e-06, "loss": 0.0046, "step": 185800 }, { "epoch": 18.59, "grad_norm": 2.773958840407431e-06, "learning_rate": 1.9080267558528427e-06, "loss": 0.0002, "step": 185900 }, { "epoch": 18.6, "grad_norm": 3.370199465280166e-06, "learning_rate": 1.9063545150501676e-06, "loss": 0.0, "step": 186000 }, { "epoch": 18.61, "grad_norm": 2.8800272957596462e-06, "learning_rate": 1.9046822742474918e-06, "loss": 0.0, "step": 186100 }, { "epoch": 18.62, "grad_norm": 4.435024038684787e-06, "learning_rate": 1.9030100334448163e-06, "loss": 0.0, "step": 186200 }, { "epoch": 18.63, "grad_norm": 9.851543836703058e-06, "learning_rate": 1.9013377926421405e-06, "loss": 0.0, "step": 186300 }, { "epoch": 18.64, "grad_norm": 4.3996242311550304e-06, "learning_rate": 1.899665551839465e-06, "loss": 0.0, "step": 186400 }, { "epoch": 18.65, "grad_norm": 3.0937371775507927e-06, "learning_rate": 1.8979933110367897e-06, "loss": 0.0, "step": 186500 }, { "epoch": 18.66, "grad_norm": 3.958779871027218e-06, "learning_rate": 1.896321070234114e-06, "loss": 0.0, "step": 186600 }, { "epoch": 18.67, "grad_norm": 0.00148894009180367, "learning_rate": 1.8946488294314384e-06, "loss": 0.0035, "step": 186700 }, { "epoch": 18.68, "grad_norm": 1.5799363609403372e-05, "learning_rate": 1.8929765886287626e-06, "loss": 0.0, "step": 186800 }, { "epoch": 18.69, "grad_norm": 5.034713922213996e-06, "learning_rate": 1.891304347826087e-06, "loss": 0.0, "step": 186900 }, { "epoch": 18.7, "grad_norm": 2.59518128586933e-06, "learning_rate": 1.8896321070234118e-06, "loss": 0.0, "step": 187000 }, { "epoch": 18.71, "grad_norm": 3.2641801226418465e-06, "learning_rate": 1.887959866220736e-06, "loss": 0.0, "step": 187100 }, { "epoch": 18.72, "grad_norm": 9.082512406166643e-06, "learning_rate": 1.8862876254180605e-06, "loss": 0.0, "step": 187200 }, { "epoch": 18.73, "grad_norm": 2.4530313567083795e-06, "learning_rate": 1.8846153846153847e-06, "loss": 0.0, "step": 187300 }, { "epoch": 18.74, "grad_norm": 3.7139268442842877e-06, "learning_rate": 1.8829431438127092e-06, "loss": 0.0, "step": 187400 }, { "epoch": 18.75, "grad_norm": 0.00024536409182474017, "learning_rate": 1.8812709030100336e-06, "loss": 0.0, "step": 187500 }, { "epoch": 18.76, "grad_norm": 0.0001586981670698151, "learning_rate": 1.879598662207358e-06, "loss": 0.0, "step": 187600 }, { "epoch": 18.77, "grad_norm": 3.170421041431837e-05, "learning_rate": 1.8779264214046826e-06, "loss": 0.0, "step": 187700 }, { "epoch": 18.78, "grad_norm": 3.257045136706438e-06, "learning_rate": 1.8762541806020068e-06, "loss": 0.003, "step": 187800 }, { "epoch": 18.79, "grad_norm": 2.242783239125856e-06, "learning_rate": 1.8745819397993313e-06, "loss": 0.0, "step": 187900 }, { "epoch": 18.8, "grad_norm": 1.1425007869547699e-05, "learning_rate": 1.8729096989966555e-06, "loss": 0.01, "step": 188000 }, { "epoch": 18.81, "grad_norm": 2.598640776341199e-06, "learning_rate": 1.8712374581939802e-06, "loss": 0.0, "step": 188100 }, { "epoch": 18.82, "grad_norm": 0.0007181576220318675, "learning_rate": 1.8695652173913044e-06, "loss": 0.0, "step": 188200 }, { "epoch": 18.83, "grad_norm": 5.5871128097351175e-06, "learning_rate": 1.867892976588629e-06, "loss": 0.0, "step": 188300 }, { "epoch": 18.84, "grad_norm": 2.6241129944537533e-06, "learning_rate": 1.8662207357859531e-06, "loss": 0.0055, "step": 188400 }, { "epoch": 18.85, "grad_norm": 0.0003280396922491491, "learning_rate": 1.8645484949832776e-06, "loss": 0.0058, "step": 188500 }, { "epoch": 18.86, "grad_norm": 6.639120692852885e-05, "learning_rate": 1.8628762541806023e-06, "loss": 0.0053, "step": 188600 }, { "epoch": 18.87, "grad_norm": 3.4430977393640205e-05, "learning_rate": 1.8612040133779265e-06, "loss": 0.0, "step": 188700 }, { "epoch": 18.88, "grad_norm": 2.713867070269771e-05, "learning_rate": 1.859531772575251e-06, "loss": 0.0037, "step": 188800 }, { "epoch": 18.89, "grad_norm": 0.0003621370124164969, "learning_rate": 1.8578595317725752e-06, "loss": 0.0, "step": 188900 }, { "epoch": 18.9, "grad_norm": 2.7829876216856064e-06, "learning_rate": 1.8561872909698997e-06, "loss": 0.0004, "step": 189000 }, { "epoch": 18.91, "grad_norm": 9.860410500550643e-05, "learning_rate": 1.8545150501672244e-06, "loss": 0.0043, "step": 189100 }, { "epoch": 18.92, "grad_norm": 2.7196410883334465e-06, "learning_rate": 1.8528428093645486e-06, "loss": 0.0, "step": 189200 }, { "epoch": 18.93, "grad_norm": 2.777722556857043e-06, "learning_rate": 1.851170568561873e-06, "loss": 0.0, "step": 189300 }, { "epoch": 18.94, "grad_norm": 3.0415455967158778e-06, "learning_rate": 1.8494983277591973e-06, "loss": 0.0, "step": 189400 }, { "epoch": 18.95, "grad_norm": 7.348385224759113e-06, "learning_rate": 1.8478260869565218e-06, "loss": 0.0, "step": 189500 }, { "epoch": 18.96, "grad_norm": 4.2450628825463355e-06, "learning_rate": 1.8461538461538465e-06, "loss": 0.0065, "step": 189600 }, { "epoch": 18.97, "grad_norm": 1.0027314601757098e-05, "learning_rate": 1.8444816053511707e-06, "loss": 0.0004, "step": 189700 }, { "epoch": 18.98, "grad_norm": 4.824078132514842e-06, "learning_rate": 1.8428093645484952e-06, "loss": 0.0, "step": 189800 }, { "epoch": 18.99, "grad_norm": 4.105546395294368e-06, "learning_rate": 1.8411371237458194e-06, "loss": 0.0, "step": 189900 }, { "epoch": 19.0, "grad_norm": 3.5762561765295686e-06, "learning_rate": 1.8394648829431439e-06, "loss": 0.0054, "step": 190000 }, { "epoch": 19.0, "eval_accuracy": 0.9856, "eval_f1": 0.9856, "eval_loss": 0.15510258078575134, "eval_runtime": 137.3776, "eval_samples_per_second": 291.168, "eval_steps_per_second": 291.168, "step": 190000 }, { "epoch": 19.01, "grad_norm": 7.700408787059132e-06, "learning_rate": 1.8377926421404686e-06, "loss": 0.0001, "step": 190100 }, { "epoch": 19.02, "grad_norm": 6.728196694893995e-06, "learning_rate": 1.8361204013377928e-06, "loss": 0.0, "step": 190200 }, { "epoch": 19.03, "grad_norm": 1.0402168300061021e-05, "learning_rate": 1.8344481605351173e-06, "loss": 0.0, "step": 190300 }, { "epoch": 19.04, "grad_norm": 8.382570376852527e-05, "learning_rate": 1.8327759197324415e-06, "loss": 0.0, "step": 190400 }, { "epoch": 19.05, "grad_norm": 2.071788185276091e-05, "learning_rate": 1.831103678929766e-06, "loss": 0.0, "step": 190500 }, { "epoch": 19.06, "grad_norm": 6.8766489675908815e-06, "learning_rate": 1.8294314381270906e-06, "loss": 0.004, "step": 190600 }, { "epoch": 19.07, "grad_norm": 0.0001236250827787444, "learning_rate": 1.8277591973244149e-06, "loss": 0.0, "step": 190700 }, { "epoch": 19.08, "grad_norm": 5.6216026678157505e-06, "learning_rate": 1.8260869565217394e-06, "loss": 0.0056, "step": 190800 }, { "epoch": 19.09, "grad_norm": 2.4372426196350716e-05, "learning_rate": 1.8244147157190636e-06, "loss": 0.0, "step": 190900 }, { "epoch": 19.1, "grad_norm": 0.0024588583037257195, "learning_rate": 1.822742474916388e-06, "loss": 0.0, "step": 191000 }, { "epoch": 19.11, "grad_norm": 2.7965081699221628e-06, "learning_rate": 1.8210702341137127e-06, "loss": 0.0, "step": 191100 }, { "epoch": 19.12, "grad_norm": 1.210660320793977e-05, "learning_rate": 1.819397993311037e-06, "loss": 0.0, "step": 191200 }, { "epoch": 19.13, "grad_norm": 4.8296110435330775e-06, "learning_rate": 1.8177257525083614e-06, "loss": 0.0081, "step": 191300 }, { "epoch": 19.14, "grad_norm": 7.996006024768576e-05, "learning_rate": 1.8160535117056857e-06, "loss": 0.0, "step": 191400 }, { "epoch": 19.15, "grad_norm": 0.00013632285117637366, "learning_rate": 1.8143812709030102e-06, "loss": 0.0, "step": 191500 }, { "epoch": 19.16, "grad_norm": 4.816226282855496e-05, "learning_rate": 1.8127090301003348e-06, "loss": 0.0, "step": 191600 }, { "epoch": 19.17, "grad_norm": 0.00023680346203036606, "learning_rate": 1.811036789297659e-06, "loss": 0.0015, "step": 191700 }, { "epoch": 19.18, "grad_norm": 1.9035480363527313e-05, "learning_rate": 1.8093645484949835e-06, "loss": 0.0, "step": 191800 }, { "epoch": 19.19, "grad_norm": 5.586427050729981e-06, "learning_rate": 1.8076923076923078e-06, "loss": 0.0093, "step": 191900 }, { "epoch": 19.2, "grad_norm": 1.2779854841937777e-05, "learning_rate": 1.8060200668896322e-06, "loss": 0.0082, "step": 192000 }, { "epoch": 19.21, "grad_norm": 3.19543346449791e-06, "learning_rate": 1.8043478260869567e-06, "loss": 0.0, "step": 192100 }, { "epoch": 19.22, "grad_norm": 6.000143457640661e-06, "learning_rate": 1.8026755852842812e-06, "loss": 0.0035, "step": 192200 }, { "epoch": 19.23, "grad_norm": 4.63162396044936e-05, "learning_rate": 1.8010033444816056e-06, "loss": 0.0, "step": 192300 }, { "epoch": 19.24, "grad_norm": 0.004847492091357708, "learning_rate": 1.7993311036789299e-06, "loss": 0.0, "step": 192400 }, { "epoch": 19.25, "grad_norm": 1.812317714211531e-05, "learning_rate": 1.7976588628762543e-06, "loss": 0.0, "step": 192500 }, { "epoch": 19.26, "grad_norm": 6.381682851497317e-06, "learning_rate": 1.7959866220735788e-06, "loss": 0.0, "step": 192600 }, { "epoch": 19.27, "grad_norm": 6.096350261941552e-06, "learning_rate": 1.7943143812709033e-06, "loss": 0.0, "step": 192700 }, { "epoch": 19.28, "grad_norm": 6.9785724008397665e-06, "learning_rate": 1.7926421404682275e-06, "loss": 0.0, "step": 192800 }, { "epoch": 19.29, "grad_norm": 0.00010105594992637634, "learning_rate": 1.790969899665552e-06, "loss": 0.0, "step": 192900 }, { "epoch": 19.3, "grad_norm": 2.523460580050596e-06, "learning_rate": 1.7892976588628764e-06, "loss": 0.0, "step": 193000 }, { "epoch": 19.31, "grad_norm": 6.7169371504860464e-06, "learning_rate": 1.7876254180602007e-06, "loss": 0.0, "step": 193100 }, { "epoch": 19.32, "grad_norm": 0.00013431630213744938, "learning_rate": 1.7859531772575253e-06, "loss": 0.0, "step": 193200 }, { "epoch": 19.33, "grad_norm": 2.817968152157846e-06, "learning_rate": 1.7842809364548496e-06, "loss": 0.0, "step": 193300 }, { "epoch": 19.34, "grad_norm": 5.19740569870919e-06, "learning_rate": 1.782608695652174e-06, "loss": 0.0006, "step": 193400 }, { "epoch": 19.35, "grad_norm": 0.014350646175444126, "learning_rate": 1.7809364548494983e-06, "loss": 0.0015, "step": 193500 }, { "epoch": 19.36, "grad_norm": 7.5378752626420464e-06, "learning_rate": 1.7792642140468228e-06, "loss": 0.0, "step": 193600 }, { "epoch": 19.37, "grad_norm": 2.4976093300210778e-06, "learning_rate": 1.7775919732441474e-06, "loss": 0.0001, "step": 193700 }, { "epoch": 19.38, "grad_norm": 3.9604841731488705e-05, "learning_rate": 1.7759197324414717e-06, "loss": 0.0, "step": 193800 }, { "epoch": 19.39, "grad_norm": 1.6136280464706942e-05, "learning_rate": 1.7742474916387961e-06, "loss": 0.0, "step": 193900 }, { "epoch": 19.4, "grad_norm": 2.822498345267377e-06, "learning_rate": 1.7725752508361204e-06, "loss": 0.0001, "step": 194000 }, { "epoch": 19.41, "grad_norm": 4.150161657889839e-06, "learning_rate": 1.7709030100334449e-06, "loss": 0.0042, "step": 194100 }, { "epoch": 19.42, "grad_norm": 1.879728188214358e-05, "learning_rate": 1.7692307692307695e-06, "loss": 0.0, "step": 194200 }, { "epoch": 19.43, "grad_norm": 8.796157089818735e-06, "learning_rate": 1.7675585284280938e-06, "loss": 0.0, "step": 194300 }, { "epoch": 19.44, "grad_norm": 4.645894023269648e-06, "learning_rate": 1.7658862876254182e-06, "loss": 0.0, "step": 194400 }, { "epoch": 19.45, "grad_norm": 9.539145139569882e-06, "learning_rate": 1.7642140468227425e-06, "loss": 0.0, "step": 194500 }, { "epoch": 19.46, "grad_norm": 2.760291636150214e-06, "learning_rate": 1.762541806020067e-06, "loss": 0.0, "step": 194600 }, { "epoch": 19.47, "grad_norm": 7.266039574460592e-06, "learning_rate": 1.7608695652173916e-06, "loss": 0.0, "step": 194700 }, { "epoch": 19.48, "grad_norm": 2.4798812319204444e-06, "learning_rate": 1.7591973244147159e-06, "loss": 0.0029, "step": 194800 }, { "epoch": 19.49, "grad_norm": 2.3052723918226548e-05, "learning_rate": 1.7575250836120403e-06, "loss": 0.0, "step": 194900 }, { "epoch": 19.5, "grad_norm": 5.209206574363634e-05, "learning_rate": 1.7558528428093646e-06, "loss": 0.0, "step": 195000 }, { "epoch": 19.5, "eval_accuracy": 0.987475, "eval_f1": 0.987475, "eval_loss": 0.1356872022151947, "eval_runtime": 135.0112, "eval_samples_per_second": 296.272, "eval_steps_per_second": 296.272, "step": 195000 }, { "epoch": 19.51, "grad_norm": 8.632729986857157e-06, "learning_rate": 1.754180602006689e-06, "loss": 0.0, "step": 195100 }, { "epoch": 19.52, "grad_norm": 5.0513413043518085e-06, "learning_rate": 1.7525083612040137e-06, "loss": 0.0, "step": 195200 }, { "epoch": 19.53, "grad_norm": 3.816026492131641e-06, "learning_rate": 1.750836120401338e-06, "loss": 0.0, "step": 195300 }, { "epoch": 19.54, "grad_norm": 1.888971496555314e-06, "learning_rate": 1.7491638795986624e-06, "loss": 0.0, "step": 195400 }, { "epoch": 19.55, "grad_norm": 0.0008715542498975992, "learning_rate": 1.7474916387959867e-06, "loss": 0.0, "step": 195500 }, { "epoch": 19.56, "grad_norm": 2.565695467637852e-05, "learning_rate": 1.7458193979933111e-06, "loss": 0.0, "step": 195600 }, { "epoch": 19.57, "grad_norm": 7.099295089574298e-06, "learning_rate": 1.7441471571906358e-06, "loss": 0.0039, "step": 195700 }, { "epoch": 19.58, "grad_norm": 3.0402254651562544e-06, "learning_rate": 1.74247491638796e-06, "loss": 0.0, "step": 195800 }, { "epoch": 19.59, "grad_norm": 3.428632680879673e-06, "learning_rate": 1.7408026755852845e-06, "loss": 0.0, "step": 195900 }, { "epoch": 19.6, "grad_norm": 7.307850410143146e-06, "learning_rate": 1.7391304347826088e-06, "loss": 0.0, "step": 196000 }, { "epoch": 19.61, "grad_norm": 2.104581108142156e-05, "learning_rate": 1.7374581939799332e-06, "loss": 0.0005, "step": 196100 }, { "epoch": 19.62, "grad_norm": 7.845720574550796e-06, "learning_rate": 1.7357859531772579e-06, "loss": 0.0, "step": 196200 }, { "epoch": 19.63, "grad_norm": 2.6197665192739805e-06, "learning_rate": 1.7341137123745821e-06, "loss": 0.0054, "step": 196300 }, { "epoch": 19.64, "grad_norm": 0.00823716539889574, "learning_rate": 1.7324414715719066e-06, "loss": 0.0043, "step": 196400 }, { "epoch": 19.65, "grad_norm": 5.974598479951965e-06, "learning_rate": 1.7307692307692308e-06, "loss": 0.006, "step": 196500 }, { "epoch": 19.66, "grad_norm": 6.843105802545324e-05, "learning_rate": 1.7290969899665553e-06, "loss": 0.0, "step": 196600 }, { "epoch": 19.67, "grad_norm": 1.3177969776734244e-05, "learning_rate": 1.7274247491638798e-06, "loss": 0.0, "step": 196700 }, { "epoch": 19.68, "grad_norm": 1.1815620382549241e-05, "learning_rate": 1.7257525083612042e-06, "loss": 0.0034, "step": 196800 }, { "epoch": 19.69, "grad_norm": 1.2062504538334906e-05, "learning_rate": 1.7240802675585287e-06, "loss": 0.0, "step": 196900 }, { "epoch": 19.7, "grad_norm": 1.7363923689117655e-05, "learning_rate": 1.722408026755853e-06, "loss": 0.0, "step": 197000 }, { "epoch": 19.71, "grad_norm": 0.003212525974959135, "learning_rate": 1.7207357859531774e-06, "loss": 0.0, "step": 197100 }, { "epoch": 19.72, "grad_norm": 2.8709598609566456e-06, "learning_rate": 1.7190635451505019e-06, "loss": 0.0, "step": 197200 }, { "epoch": 19.73, "grad_norm": 3.688925971800927e-06, "learning_rate": 1.7173913043478263e-06, "loss": 0.0, "step": 197300 }, { "epoch": 19.74, "grad_norm": 4.4789187086280435e-05, "learning_rate": 1.7157190635451506e-06, "loss": 0.0, "step": 197400 }, { "epoch": 19.75, "grad_norm": 0.010275106877088547, "learning_rate": 1.714046822742475e-06, "loss": 0.0, "step": 197500 }, { "epoch": 19.76, "grad_norm": 3.637941290435265e-06, "learning_rate": 1.7123745819397995e-06, "loss": 0.0, "step": 197600 }, { "epoch": 19.77, "grad_norm": 8.29253167466959e-06, "learning_rate": 1.710702341137124e-06, "loss": 0.0018, "step": 197700 }, { "epoch": 19.78, "grad_norm": 3.868249223160092e-06, "learning_rate": 1.7090301003344484e-06, "loss": 0.0, "step": 197800 }, { "epoch": 19.79, "grad_norm": 3.6814990380662493e-06, "learning_rate": 1.7073578595317727e-06, "loss": 0.0, "step": 197900 }, { "epoch": 19.8, "grad_norm": 1.3625873179989867e-05, "learning_rate": 1.7056856187290971e-06, "loss": 0.0, "step": 198000 }, { "epoch": 19.81, "grad_norm": 0.0058494736440479755, "learning_rate": 1.7040133779264214e-06, "loss": 0.0063, "step": 198100 }, { "epoch": 19.82, "grad_norm": 3.438921112319804e-06, "learning_rate": 1.7023411371237458e-06, "loss": 0.0, "step": 198200 }, { "epoch": 19.83, "grad_norm": 0.0007142440881580114, "learning_rate": 1.7006688963210705e-06, "loss": 0.0, "step": 198300 }, { "epoch": 19.84, "grad_norm": 1.3060711353318766e-05, "learning_rate": 1.6989966555183947e-06, "loss": 0.0, "step": 198400 }, { "epoch": 19.85, "grad_norm": 0.007419825531542301, "learning_rate": 1.6973244147157192e-06, "loss": 0.0015, "step": 198500 }, { "epoch": 19.86, "grad_norm": 2.1179796476644697e-06, "learning_rate": 1.6956521739130435e-06, "loss": 0.0, "step": 198600 }, { "epoch": 19.87, "grad_norm": 4.210245151625713e-06, "learning_rate": 1.693979933110368e-06, "loss": 0.0, "step": 198700 }, { "epoch": 19.88, "grad_norm": 2.1146457584109157e-05, "learning_rate": 1.6923076923076926e-06, "loss": 0.0, "step": 198800 }, { "epoch": 19.89, "grad_norm": 2.6942605018120958e-06, "learning_rate": 1.6906354515050168e-06, "loss": 0.0041, "step": 198900 }, { "epoch": 19.9, "grad_norm": 2.950105226773303e-06, "learning_rate": 1.6889632107023413e-06, "loss": 0.0, "step": 199000 }, { "epoch": 19.91, "grad_norm": 3.4126630907849176e-06, "learning_rate": 1.6872909698996655e-06, "loss": 0.0, "step": 199100 }, { "epoch": 19.92, "grad_norm": 3.211943521819194e-06, "learning_rate": 1.68561872909699e-06, "loss": 0.0, "step": 199200 }, { "epoch": 19.93, "grad_norm": 4.190772961010225e-05, "learning_rate": 1.6839464882943147e-06, "loss": 0.0, "step": 199300 }, { "epoch": 19.94, "grad_norm": 0.0004158316587563604, "learning_rate": 1.682274247491639e-06, "loss": 0.0, "step": 199400 }, { "epoch": 19.95, "grad_norm": 3.3130665997305186e-06, "learning_rate": 1.6806020066889634e-06, "loss": 0.0, "step": 199500 }, { "epoch": 19.96, "grad_norm": 3.0582334602513583e-06, "learning_rate": 1.6789297658862876e-06, "loss": 0.0, "step": 199600 }, { "epoch": 19.97, "grad_norm": 3.249564770158031e-06, "learning_rate": 1.677257525083612e-06, "loss": 0.0, "step": 199700 }, { "epoch": 19.98, "grad_norm": 2.1675789412256563e-06, "learning_rate": 1.6755852842809368e-06, "loss": 0.0, "step": 199800 }, { "epoch": 19.99, "grad_norm": 3.1598647183272988e-06, "learning_rate": 1.673913043478261e-06, "loss": 0.0, "step": 199900 }, { "epoch": 20.0, "grad_norm": 2.6797370082931593e-06, "learning_rate": 1.6722408026755855e-06, "loss": 0.0, "step": 200000 }, { "epoch": 20.0, "eval_accuracy": 0.98775, "eval_f1": 0.98775, "eval_loss": 0.13826440274715424, "eval_runtime": 133.9725, "eval_samples_per_second": 298.569, "eval_steps_per_second": 298.569, "step": 200000 }, { "epoch": 20.01, "grad_norm": 4.444530532055069e-06, "learning_rate": 1.6705685618729097e-06, "loss": 0.0, "step": 200100 }, { "epoch": 20.02, "grad_norm": 2.957669721581624e-06, "learning_rate": 1.6688963210702342e-06, "loss": 0.0, "step": 200200 }, { "epoch": 20.03, "grad_norm": 3.291775101388339e-06, "learning_rate": 1.6672240802675589e-06, "loss": 0.0, "step": 200300 }, { "epoch": 20.04, "grad_norm": 2.782702722470276e-06, "learning_rate": 1.6655518394648831e-06, "loss": 0.0031, "step": 200400 }, { "epoch": 20.05, "grad_norm": 8.329800039064139e-06, "learning_rate": 1.6638795986622076e-06, "loss": 0.0031, "step": 200500 }, { "epoch": 20.06, "grad_norm": 3.788985168284853e-06, "learning_rate": 1.6622073578595318e-06, "loss": 0.0, "step": 200600 }, { "epoch": 20.07, "grad_norm": 3.6758290207217215e-06, "learning_rate": 1.6605351170568563e-06, "loss": 0.0, "step": 200700 }, { "epoch": 20.08, "grad_norm": 3.0887517823430244e-06, "learning_rate": 1.658862876254181e-06, "loss": 0.0, "step": 200800 }, { "epoch": 20.09, "grad_norm": 6.320357351796702e-05, "learning_rate": 1.6571906354515052e-06, "loss": 0.0, "step": 200900 }, { "epoch": 20.1, "grad_norm": 2.65393919107737e-06, "learning_rate": 1.6555183946488297e-06, "loss": 0.0, "step": 201000 }, { "epoch": 20.11, "grad_norm": 3.1102713364816736e-06, "learning_rate": 1.653846153846154e-06, "loss": 0.0, "step": 201100 }, { "epoch": 20.12, "grad_norm": 2.6238412829115987e-06, "learning_rate": 1.6521739130434784e-06, "loss": 0.0112, "step": 201200 }, { "epoch": 20.13, "grad_norm": 2.09651898330776e-06, "learning_rate": 1.6505016722408028e-06, "loss": 0.0, "step": 201300 }, { "epoch": 20.14, "grad_norm": 2.435858732496854e-06, "learning_rate": 1.6488294314381273e-06, "loss": 0.0, "step": 201400 }, { "epoch": 20.15, "grad_norm": 4.056108537042746e-06, "learning_rate": 1.6471571906354518e-06, "loss": 0.0063, "step": 201500 }, { "epoch": 20.16, "grad_norm": 4.716017429018393e-06, "learning_rate": 1.645484949832776e-06, "loss": 0.0003, "step": 201600 }, { "epoch": 20.17, "grad_norm": 4.022814209747594e-06, "learning_rate": 1.6438127090301005e-06, "loss": 0.0, "step": 201700 }, { "epoch": 20.18, "grad_norm": 1.1780533895944245e-05, "learning_rate": 1.642140468227425e-06, "loss": 0.011, "step": 201800 }, { "epoch": 20.19, "grad_norm": 3.5756809211306972e-06, "learning_rate": 1.6404682274247494e-06, "loss": 0.0023, "step": 201900 }, { "epoch": 20.2, "grad_norm": 3.5133691653754795e-06, "learning_rate": 1.6387959866220736e-06, "loss": 0.0, "step": 202000 }, { "epoch": 20.21, "grad_norm": 2.1817815650138073e-05, "learning_rate": 1.637123745819398e-06, "loss": 0.0, "step": 202100 }, { "epoch": 20.22, "grad_norm": 7.410139369312674e-05, "learning_rate": 1.6354515050167226e-06, "loss": 0.0059, "step": 202200 }, { "epoch": 20.23, "grad_norm": 1.0194399692409206e-05, "learning_rate": 1.633779264214047e-06, "loss": 0.0, "step": 202300 }, { "epoch": 20.24, "grad_norm": 2.9090826956235105e-06, "learning_rate": 1.6321070234113715e-06, "loss": 0.0, "step": 202400 }, { "epoch": 20.25, "grad_norm": 1.4217706848285161e-05, "learning_rate": 1.6304347826086957e-06, "loss": 0.0, "step": 202500 }, { "epoch": 20.26, "grad_norm": 4.9936047616938595e-06, "learning_rate": 1.6287625418060202e-06, "loss": 0.0, "step": 202600 }, { "epoch": 20.27, "grad_norm": 2.9128027563274372e-06, "learning_rate": 1.6270903010033444e-06, "loss": 0.0, "step": 202700 }, { "epoch": 20.28, "grad_norm": 4.234791049384512e-06, "learning_rate": 1.625418060200669e-06, "loss": 0.0076, "step": 202800 }, { "epoch": 20.29, "grad_norm": 2.635060582178994e-06, "learning_rate": 1.6237458193979936e-06, "loss": 0.0, "step": 202900 }, { "epoch": 20.3, "grad_norm": 2.543606751714833e-05, "learning_rate": 1.6220735785953178e-06, "loss": 0.0, "step": 203000 }, { "epoch": 20.31, "grad_norm": 1.7358966942992993e-05, "learning_rate": 1.6204013377926423e-06, "loss": 0.0002, "step": 203100 }, { "epoch": 20.32, "grad_norm": 0.0010591618483886123, "learning_rate": 1.6187290969899665e-06, "loss": 0.0, "step": 203200 }, { "epoch": 20.33, "grad_norm": 7.357494723692071e-06, "learning_rate": 1.617056856187291e-06, "loss": 0.0, "step": 203300 }, { "epoch": 20.34, "grad_norm": 2.521198803151492e-05, "learning_rate": 1.6153846153846157e-06, "loss": 0.0036, "step": 203400 }, { "epoch": 20.35, "grad_norm": 4.66443998448085e-06, "learning_rate": 1.61371237458194e-06, "loss": 0.0, "step": 203500 }, { "epoch": 20.36, "grad_norm": 0.00010037265747087076, "learning_rate": 1.6120401337792644e-06, "loss": 0.0019, "step": 203600 }, { "epoch": 20.37, "grad_norm": 0.0002850138407666236, "learning_rate": 1.6103678929765886e-06, "loss": 0.0, "step": 203700 }, { "epoch": 20.38, "grad_norm": 6.3181232690112665e-06, "learning_rate": 1.608695652173913e-06, "loss": 0.0, "step": 203800 }, { "epoch": 20.39, "grad_norm": 9.268571375287138e-06, "learning_rate": 1.6070234113712377e-06, "loss": 0.0, "step": 203900 }, { "epoch": 20.4, "grad_norm": 3.4747040444926824e-06, "learning_rate": 1.605351170568562e-06, "loss": 0.0035, "step": 204000 }, { "epoch": 20.41, "grad_norm": 2.8883071081509115e-06, "learning_rate": 1.6036789297658865e-06, "loss": 0.0, "step": 204100 }, { "epoch": 20.42, "grad_norm": 6.735133865731768e-06, "learning_rate": 1.6020066889632107e-06, "loss": 0.0038, "step": 204200 }, { "epoch": 20.43, "grad_norm": 2.2144862668938003e-05, "learning_rate": 1.6003344481605352e-06, "loss": 0.0001, "step": 204300 }, { "epoch": 20.44, "grad_norm": 3.1439285521628335e-06, "learning_rate": 1.5986622073578598e-06, "loss": 0.001, "step": 204400 }, { "epoch": 20.45, "grad_norm": 5.965619129710831e-05, "learning_rate": 1.596989966555184e-06, "loss": 0.0018, "step": 204500 }, { "epoch": 20.46, "grad_norm": 2.0531138943624683e-05, "learning_rate": 1.5953177257525085e-06, "loss": 0.0, "step": 204600 }, { "epoch": 20.47, "grad_norm": 1.9690560293383896e-06, "learning_rate": 1.5936454849498328e-06, "loss": 0.0, "step": 204700 }, { "epoch": 20.48, "grad_norm": 1.1894359886355232e-05, "learning_rate": 1.5919732441471573e-06, "loss": 0.0008, "step": 204800 }, { "epoch": 20.49, "grad_norm": 3.1937240692059277e-06, "learning_rate": 1.590301003344482e-06, "loss": 0.0, "step": 204900 }, { "epoch": 20.5, "grad_norm": 3.63977778761182e-05, "learning_rate": 1.5886287625418062e-06, "loss": 0.0, "step": 205000 }, { "epoch": 20.5, "eval_accuracy": 0.987575, "eval_f1": 0.987575, "eval_loss": 0.13013528287410736, "eval_runtime": 133.1301, "eval_samples_per_second": 300.458, "eval_steps_per_second": 300.458, "step": 205000 }, { "epoch": 20.51, "grad_norm": 0.0012186990352347493, "learning_rate": 1.5869565217391306e-06, "loss": 0.0, "step": 205100 }, { "epoch": 20.52, "grad_norm": 3.058609763684217e-06, "learning_rate": 1.5852842809364549e-06, "loss": 0.0, "step": 205200 }, { "epoch": 20.53, "grad_norm": 4.367856035969453e-06, "learning_rate": 1.5836120401337793e-06, "loss": 0.005, "step": 205300 }, { "epoch": 20.54, "grad_norm": 7.178983196354238e-06, "learning_rate": 1.581939799331104e-06, "loss": 0.0063, "step": 205400 }, { "epoch": 20.55, "grad_norm": 2.945566848211456e-06, "learning_rate": 1.5802675585284283e-06, "loss": 0.0, "step": 205500 }, { "epoch": 20.56, "grad_norm": 5.289690761856036e-06, "learning_rate": 1.5785953177257527e-06, "loss": 0.0, "step": 205600 }, { "epoch": 20.57, "grad_norm": 5.60561784368474e-06, "learning_rate": 1.576923076923077e-06, "loss": 0.0, "step": 205700 }, { "epoch": 20.58, "grad_norm": 2.395588580839103e-06, "learning_rate": 1.5752508361204014e-06, "loss": 0.0, "step": 205800 }, { "epoch": 20.59, "grad_norm": 4.692345100920647e-06, "learning_rate": 1.573578595317726e-06, "loss": 0.0032, "step": 205900 }, { "epoch": 20.6, "grad_norm": 5.715948645956814e-06, "learning_rate": 1.5719063545150504e-06, "loss": 0.0, "step": 206000 }, { "epoch": 20.61, "grad_norm": 3.817430297203828e-06, "learning_rate": 1.5702341137123748e-06, "loss": 0.0, "step": 206100 }, { "epoch": 20.62, "grad_norm": 1.3704256161872763e-05, "learning_rate": 1.568561872909699e-06, "loss": 0.0002, "step": 206200 }, { "epoch": 20.63, "grad_norm": 2.3260201487573795e-05, "learning_rate": 1.5668896321070235e-06, "loss": 0.0, "step": 206300 }, { "epoch": 20.64, "grad_norm": 3.091832013524254e-06, "learning_rate": 1.565217391304348e-06, "loss": 0.0, "step": 206400 }, { "epoch": 20.65, "grad_norm": 5.030944521422498e-05, "learning_rate": 1.5635451505016724e-06, "loss": 0.0087, "step": 206500 }, { "epoch": 20.66, "grad_norm": 4.412408088683151e-05, "learning_rate": 1.5618729096989967e-06, "loss": 0.0, "step": 206600 }, { "epoch": 20.67, "grad_norm": 6.452126854128437e-06, "learning_rate": 1.5602006688963212e-06, "loss": 0.0, "step": 206700 }, { "epoch": 20.68, "grad_norm": 0.0005374864558689296, "learning_rate": 1.5585284280936456e-06, "loss": 0.0046, "step": 206800 }, { "epoch": 20.69, "grad_norm": 8.672196418046951e-05, "learning_rate": 1.55685618729097e-06, "loss": 0.0, "step": 206900 }, { "epoch": 20.7, "grad_norm": 3.4972249522979837e-06, "learning_rate": 1.5551839464882945e-06, "loss": 0.0, "step": 207000 }, { "epoch": 20.71, "grad_norm": 6.360653515002923e-06, "learning_rate": 1.5535117056856188e-06, "loss": 0.0, "step": 207100 }, { "epoch": 20.72, "grad_norm": 9.319689525000285e-06, "learning_rate": 1.5518394648829432e-06, "loss": 0.0, "step": 207200 }, { "epoch": 20.73, "grad_norm": 3.967582870245678e-06, "learning_rate": 1.5501672240802675e-06, "loss": 0.0, "step": 207300 }, { "epoch": 20.74, "grad_norm": 5.464647074404638e-06, "learning_rate": 1.5484949832775922e-06, "loss": 0.0, "step": 207400 }, { "epoch": 20.75, "grad_norm": 1.3912532267568167e-05, "learning_rate": 1.5468227424749166e-06, "loss": 0.0, "step": 207500 }, { "epoch": 20.76, "grad_norm": 1.1850606824737042e-05, "learning_rate": 1.5451505016722409e-06, "loss": 0.0, "step": 207600 }, { "epoch": 20.77, "grad_norm": 4.910432380711427e-06, "learning_rate": 1.5434782608695653e-06, "loss": 0.0, "step": 207700 }, { "epoch": 20.78, "grad_norm": 3.082827106482e-06, "learning_rate": 1.5418060200668896e-06, "loss": 0.0, "step": 207800 }, { "epoch": 20.79, "grad_norm": 5.340507868822897e-06, "learning_rate": 1.540133779264214e-06, "loss": 0.0, "step": 207900 }, { "epoch": 20.8, "grad_norm": 2.699596734601073e-06, "learning_rate": 1.5384615384615387e-06, "loss": 0.0, "step": 208000 }, { "epoch": 20.81, "grad_norm": 7.638249371666461e-06, "learning_rate": 1.536789297658863e-06, "loss": 0.0105, "step": 208100 }, { "epoch": 20.82, "grad_norm": 2.4105136617436074e-05, "learning_rate": 1.5351170568561874e-06, "loss": 0.0017, "step": 208200 }, { "epoch": 20.83, "grad_norm": 7.558439392596483e-06, "learning_rate": 1.5334448160535117e-06, "loss": 0.0, "step": 208300 }, { "epoch": 20.84, "grad_norm": 5.754342055297457e-06, "learning_rate": 1.5317725752508361e-06, "loss": 0.0, "step": 208400 }, { "epoch": 20.85, "grad_norm": 4.127535430598073e-05, "learning_rate": 1.5301003344481608e-06, "loss": 0.0, "step": 208500 }, { "epoch": 20.86, "grad_norm": 2.812850652844645e-06, "learning_rate": 1.528428093645485e-06, "loss": 0.0003, "step": 208600 }, { "epoch": 20.87, "grad_norm": 5.573003363679163e-06, "learning_rate": 1.5267558528428095e-06, "loss": 0.0, "step": 208700 }, { "epoch": 20.88, "grad_norm": 9.813478754949756e-06, "learning_rate": 1.5250836120401338e-06, "loss": 0.0, "step": 208800 }, { "epoch": 20.89, "grad_norm": 1.7837548512034118e-05, "learning_rate": 1.5234113712374582e-06, "loss": 0.0, "step": 208900 }, { "epoch": 20.9, "grad_norm": 2.806435077218339e-06, "learning_rate": 1.521739130434783e-06, "loss": 0.0, "step": 209000 }, { "epoch": 20.91, "grad_norm": 2.905662768171169e-06, "learning_rate": 1.5200668896321071e-06, "loss": 0.0, "step": 209100 }, { "epoch": 20.92, "grad_norm": 0.005608703941106796, "learning_rate": 1.5183946488294316e-06, "loss": 0.0057, "step": 209200 }, { "epoch": 20.93, "grad_norm": 4.323443590692477e-06, "learning_rate": 1.5167224080267559e-06, "loss": 0.0, "step": 209300 }, { "epoch": 20.94, "grad_norm": 2.9272045139805414e-05, "learning_rate": 1.5150501672240803e-06, "loss": 0.0033, "step": 209400 }, { "epoch": 20.95, "grad_norm": 1.3083412341075018e-05, "learning_rate": 1.513377926421405e-06, "loss": 0.0058, "step": 209500 }, { "epoch": 20.96, "grad_norm": 5.5376938689732924e-05, "learning_rate": 1.5117056856187292e-06, "loss": 0.0068, "step": 209600 }, { "epoch": 20.97, "grad_norm": 1.1804182577179745e-05, "learning_rate": 1.5100334448160537e-06, "loss": 0.0, "step": 209700 }, { "epoch": 20.98, "grad_norm": 0.0001499229110777378, "learning_rate": 1.508361204013378e-06, "loss": 0.0, "step": 209800 }, { "epoch": 20.99, "grad_norm": 3.3819417240010807e-06, "learning_rate": 1.5066889632107024e-06, "loss": 0.0, "step": 209900 }, { "epoch": 21.0, "grad_norm": 0.0023802791256457567, "learning_rate": 1.505016722408027e-06, "loss": 0.0, "step": 210000 }, { "epoch": 21.0, "eval_accuracy": 0.987625, "eval_f1": 0.987625, "eval_loss": 0.12249647080898285, "eval_runtime": 135.0345, "eval_samples_per_second": 296.221, "eval_steps_per_second": 296.221, "step": 210000 }, { "epoch": 21.01, "grad_norm": 6.9270877247618046e-06, "learning_rate": 1.5033444816053513e-06, "loss": 0.0, "step": 210100 }, { "epoch": 21.02, "grad_norm": 3.770094917854294e-05, "learning_rate": 1.5016722408026758e-06, "loss": 0.0, "step": 210200 }, { "epoch": 21.03, "grad_norm": 1.953655737452209e-05, "learning_rate": 1.5e-06, "loss": 0.0, "step": 210300 }, { "epoch": 21.04, "grad_norm": 0.0002708936808630824, "learning_rate": 1.4983277591973245e-06, "loss": 0.0, "step": 210400 }, { "epoch": 21.05, "grad_norm": 6.020537512085866e-06, "learning_rate": 1.4966555183946492e-06, "loss": 0.0, "step": 210500 }, { "epoch": 21.06, "grad_norm": 0.03410932794213295, "learning_rate": 1.4949832775919734e-06, "loss": 0.0, "step": 210600 }, { "epoch": 21.07, "grad_norm": 1.6557878552703187e-05, "learning_rate": 1.4933110367892979e-06, "loss": 0.0, "step": 210700 }, { "epoch": 21.08, "grad_norm": 3.707931682583876e-06, "learning_rate": 1.4916387959866221e-06, "loss": 0.0, "step": 210800 }, { "epoch": 21.09, "grad_norm": 4.064024324179627e-06, "learning_rate": 1.4899665551839466e-06, "loss": 0.0024, "step": 210900 }, { "epoch": 21.1, "grad_norm": 3.917386493412778e-06, "learning_rate": 1.488294314381271e-06, "loss": 0.0, "step": 211000 }, { "epoch": 21.11, "grad_norm": 5.051691459811991e-06, "learning_rate": 1.4866220735785955e-06, "loss": 0.0, "step": 211100 }, { "epoch": 21.12, "grad_norm": 1.3631856745632831e-05, "learning_rate": 1.4849498327759198e-06, "loss": 0.0, "step": 211200 }, { "epoch": 21.13, "grad_norm": 4.376207925815834e-06, "learning_rate": 1.4832775919732442e-06, "loss": 0.0, "step": 211300 }, { "epoch": 21.14, "grad_norm": 0.0007653874927200377, "learning_rate": 1.4816053511705687e-06, "loss": 0.0, "step": 211400 }, { "epoch": 21.15, "grad_norm": 0.0003364417643751949, "learning_rate": 1.4799331103678931e-06, "loss": 0.0055, "step": 211500 }, { "epoch": 21.16, "grad_norm": 3.0988155685918173e-06, "learning_rate": 1.4782608695652176e-06, "loss": 0.0015, "step": 211600 }, { "epoch": 21.17, "grad_norm": 3.886037575284718e-06, "learning_rate": 1.4765886287625418e-06, "loss": 0.0, "step": 211700 }, { "epoch": 21.18, "grad_norm": 9.195034181175288e-06, "learning_rate": 1.4749163879598663e-06, "loss": 0.0041, "step": 211800 }, { "epoch": 21.19, "grad_norm": 5.210130439081695e-06, "learning_rate": 1.4732441471571906e-06, "loss": 0.0027, "step": 211900 }, { "epoch": 21.2, "grad_norm": 8.08697677712189e-06, "learning_rate": 1.4715719063545152e-06, "loss": 0.0, "step": 212000 }, { "epoch": 21.21, "grad_norm": 3.7482986954273656e-06, "learning_rate": 1.4698996655518397e-06, "loss": 0.0001, "step": 212100 }, { "epoch": 21.22, "grad_norm": 1.588550367159769e-05, "learning_rate": 1.468227424749164e-06, "loss": 0.0, "step": 212200 }, { "epoch": 21.23, "grad_norm": 0.00480658421292901, "learning_rate": 1.4665551839464884e-06, "loss": 0.0, "step": 212300 }, { "epoch": 21.24, "grad_norm": 7.852279850339983e-06, "learning_rate": 1.4648829431438126e-06, "loss": 0.0, "step": 212400 }, { "epoch": 21.25, "grad_norm": 6.503975600935519e-05, "learning_rate": 1.4632107023411373e-06, "loss": 0.0047, "step": 212500 }, { "epoch": 21.26, "grad_norm": 3.9408669181284495e-06, "learning_rate": 1.4615384615384618e-06, "loss": 0.0, "step": 212600 }, { "epoch": 21.27, "grad_norm": 0.00014495612413156778, "learning_rate": 1.459866220735786e-06, "loss": 0.0, "step": 212700 }, { "epoch": 21.28, "grad_norm": 2.9439470381475985e-06, "learning_rate": 1.4581939799331105e-06, "loss": 0.0, "step": 212800 }, { "epoch": 21.29, "grad_norm": 0.00031986075919121504, "learning_rate": 1.4565217391304347e-06, "loss": 0.0, "step": 212900 }, { "epoch": 21.3, "grad_norm": 0.00012544303899630904, "learning_rate": 1.4548494983277592e-06, "loss": 0.0079, "step": 213000 }, { "epoch": 21.31, "grad_norm": 9.7873898994294e-06, "learning_rate": 1.4531772575250839e-06, "loss": 0.0, "step": 213100 }, { "epoch": 21.32, "grad_norm": 0.029412349686026573, "learning_rate": 1.4515050167224081e-06, "loss": 0.0, "step": 213200 }, { "epoch": 21.33, "grad_norm": 6.39492500340566e-05, "learning_rate": 1.4498327759197326e-06, "loss": 0.0, "step": 213300 }, { "epoch": 21.34, "grad_norm": 7.848659151932225e-06, "learning_rate": 1.4481605351170568e-06, "loss": 0.0, "step": 213400 }, { "epoch": 21.35, "grad_norm": 1.3351653251447715e-05, "learning_rate": 1.4464882943143813e-06, "loss": 0.0, "step": 213500 }, { "epoch": 21.36, "grad_norm": 0.0017086967127397656, "learning_rate": 1.444816053511706e-06, "loss": 0.0002, "step": 213600 }, { "epoch": 21.37, "grad_norm": 0.004947548732161522, "learning_rate": 1.4431438127090302e-06, "loss": 0.0, "step": 213700 }, { "epoch": 21.38, "grad_norm": 2.3903007786429953e-06, "learning_rate": 1.4414715719063547e-06, "loss": 0.0, "step": 213800 }, { "epoch": 21.39, "grad_norm": 0.00041224126471206546, "learning_rate": 1.439799331103679e-06, "loss": 0.0, "step": 213900 }, { "epoch": 21.4, "grad_norm": 0.0007366500794887543, "learning_rate": 1.4381270903010034e-06, "loss": 0.0028, "step": 214000 }, { "epoch": 21.41, "grad_norm": 2.2902284399606287e-05, "learning_rate": 1.436454849498328e-06, "loss": 0.0, "step": 214100 }, { "epoch": 21.42, "grad_norm": 1.2737849829136394e-05, "learning_rate": 1.4347826086956523e-06, "loss": 0.0039, "step": 214200 }, { "epoch": 21.43, "grad_norm": 0.001080359797924757, "learning_rate": 1.4331103678929768e-06, "loss": 0.0, "step": 214300 }, { "epoch": 21.44, "grad_norm": 4.485631507122889e-06, "learning_rate": 1.431438127090301e-06, "loss": 0.0, "step": 214400 }, { "epoch": 21.45, "grad_norm": 0.0007343352190218866, "learning_rate": 1.4297658862876255e-06, "loss": 0.0, "step": 214500 }, { "epoch": 21.46, "grad_norm": 9.769354073796421e-05, "learning_rate": 1.4280936454849501e-06, "loss": 0.0, "step": 214600 }, { "epoch": 21.47, "grad_norm": 0.001233136747032404, "learning_rate": 1.4264214046822744e-06, "loss": 0.0, "step": 214700 }, { "epoch": 21.48, "grad_norm": 4.587949206324993e-06, "learning_rate": 1.4247491638795989e-06, "loss": 0.0, "step": 214800 }, { "epoch": 21.49, "grad_norm": 3.4589143069752026e-06, "learning_rate": 1.423076923076923e-06, "loss": 0.0, "step": 214900 }, { "epoch": 21.5, "grad_norm": 6.62579077470582e-06, "learning_rate": 1.4214046822742476e-06, "loss": 0.0, "step": 215000 }, { "epoch": 21.5, "eval_accuracy": 0.987975, "eval_f1": 0.987975, "eval_loss": 0.12572382390499115, "eval_runtime": 133.9498, "eval_samples_per_second": 298.619, "eval_steps_per_second": 298.619, "step": 215000 }, { "epoch": 21.51, "grad_norm": 3.987763193435967e-06, "learning_rate": 1.4197324414715722e-06, "loss": 0.0, "step": 215100 }, { "epoch": 21.52, "grad_norm": 0.012753859162330627, "learning_rate": 1.4180602006688965e-06, "loss": 0.0, "step": 215200 }, { "epoch": 21.53, "grad_norm": 1.578287992742844e-05, "learning_rate": 1.416387959866221e-06, "loss": 0.0, "step": 215300 }, { "epoch": 21.54, "grad_norm": 2.9020106921961997e-06, "learning_rate": 1.4147157190635452e-06, "loss": 0.0, "step": 215400 }, { "epoch": 21.55, "grad_norm": 7.922858458186965e-06, "learning_rate": 1.4130434782608697e-06, "loss": 0.0, "step": 215500 }, { "epoch": 21.56, "grad_norm": 0.020123714581131935, "learning_rate": 1.4113712374581941e-06, "loss": 0.0023, "step": 215600 }, { "epoch": 21.57, "grad_norm": 6.435708201024681e-05, "learning_rate": 1.4096989966555186e-06, "loss": 0.0, "step": 215700 }, { "epoch": 21.58, "grad_norm": 3.7706938655901467e-06, "learning_rate": 1.408026755852843e-06, "loss": 0.0055, "step": 215800 }, { "epoch": 21.59, "grad_norm": 3.829626621154603e-06, "learning_rate": 1.4063545150501673e-06, "loss": 0.0, "step": 215900 }, { "epoch": 21.6, "grad_norm": 2.566429202488507e-06, "learning_rate": 1.4046822742474917e-06, "loss": 0.0, "step": 216000 }, { "epoch": 21.61, "grad_norm": 8.226523277699016e-06, "learning_rate": 1.4030100334448162e-06, "loss": 0.0, "step": 216100 }, { "epoch": 21.62, "grad_norm": 3.956181444664253e-06, "learning_rate": 1.4013377926421407e-06, "loss": 0.0, "step": 216200 }, { "epoch": 21.63, "grad_norm": 4.045846708322642e-06, "learning_rate": 1.399665551839465e-06, "loss": 0.0004, "step": 216300 }, { "epoch": 21.64, "grad_norm": 1.9764947865041904e-05, "learning_rate": 1.3979933110367894e-06, "loss": 0.0, "step": 216400 }, { "epoch": 21.65, "grad_norm": 2.6922571123577654e-06, "learning_rate": 1.3963210702341138e-06, "loss": 0.0, "step": 216500 }, { "epoch": 21.66, "grad_norm": 4.089896265213611e-06, "learning_rate": 1.3946488294314383e-06, "loss": 0.0, "step": 216600 }, { "epoch": 21.67, "grad_norm": 8.798033377388492e-05, "learning_rate": 1.3929765886287628e-06, "loss": 0.0, "step": 216700 }, { "epoch": 21.68, "grad_norm": 0.0005297737661749125, "learning_rate": 1.391304347826087e-06, "loss": 0.0, "step": 216800 }, { "epoch": 21.69, "grad_norm": 5.265828804112971e-06, "learning_rate": 1.3896321070234115e-06, "loss": 0.0, "step": 216900 }, { "epoch": 21.7, "grad_norm": 3.205472694389755e-06, "learning_rate": 1.3879598662207357e-06, "loss": 0.0028, "step": 217000 }, { "epoch": 21.71, "grad_norm": 3.2379621188738383e-06, "learning_rate": 1.3862876254180604e-06, "loss": 0.0, "step": 217100 }, { "epoch": 21.72, "grad_norm": 3.78774734599574e-06, "learning_rate": 1.3846153846153848e-06, "loss": 0.0025, "step": 217200 }, { "epoch": 21.73, "grad_norm": 4.6226919039327186e-06, "learning_rate": 1.382943143812709e-06, "loss": 0.0, "step": 217300 }, { "epoch": 21.74, "grad_norm": 4.582865221891552e-05, "learning_rate": 1.3812709030100336e-06, "loss": 0.0, "step": 217400 }, { "epoch": 21.75, "grad_norm": 3.6365586311148945e-06, "learning_rate": 1.3795986622073578e-06, "loss": 0.0, "step": 217500 }, { "epoch": 21.76, "grad_norm": 3.7290149066393496e-06, "learning_rate": 1.3779264214046825e-06, "loss": 0.0, "step": 217600 }, { "epoch": 21.77, "grad_norm": 3.5055440093856305e-05, "learning_rate": 1.376254180602007e-06, "loss": 0.0, "step": 217700 }, { "epoch": 21.78, "grad_norm": 0.0001472236035624519, "learning_rate": 1.3745819397993312e-06, "loss": 0.0, "step": 217800 }, { "epoch": 21.79, "grad_norm": 2.937935096269939e-05, "learning_rate": 1.3729096989966556e-06, "loss": 0.0, "step": 217900 }, { "epoch": 21.8, "grad_norm": 1.0437177479616366e-05, "learning_rate": 1.37123745819398e-06, "loss": 0.0, "step": 218000 }, { "epoch": 21.81, "grad_norm": 2.2088892364990897e-05, "learning_rate": 1.3695652173913044e-06, "loss": 0.0, "step": 218100 }, { "epoch": 21.82, "grad_norm": 3.875371112371795e-05, "learning_rate": 1.367892976588629e-06, "loss": 0.0, "step": 218200 }, { "epoch": 21.83, "grad_norm": 4.937228368362412e-06, "learning_rate": 1.3662207357859533e-06, "loss": 0.0, "step": 218300 }, { "epoch": 21.84, "grad_norm": 0.0001001003329292871, "learning_rate": 1.3645484949832777e-06, "loss": 0.0, "step": 218400 }, { "epoch": 21.85, "grad_norm": 1.2318441804382019e-05, "learning_rate": 1.362876254180602e-06, "loss": 0.0, "step": 218500 }, { "epoch": 21.86, "grad_norm": 2.1146149720152607e-06, "learning_rate": 1.3612040133779264e-06, "loss": 0.0, "step": 218600 }, { "epoch": 21.87, "grad_norm": 7.362048199865967e-05, "learning_rate": 1.3595317725752511e-06, "loss": 0.0, "step": 218700 }, { "epoch": 21.88, "grad_norm": 2.4402959297731286e-06, "learning_rate": 1.3578595317725754e-06, "loss": 0.0, "step": 218800 }, { "epoch": 21.89, "grad_norm": 2.172135054934188e-06, "learning_rate": 1.3561872909698998e-06, "loss": 0.0, "step": 218900 }, { "epoch": 21.9, "grad_norm": 7.556840955658117e-06, "learning_rate": 1.354515050167224e-06, "loss": 0.0039, "step": 219000 }, { "epoch": 21.91, "grad_norm": 2.4002990812732605e-06, "learning_rate": 1.3528428093645485e-06, "loss": 0.0, "step": 219100 }, { "epoch": 21.92, "grad_norm": 7.363054464804009e-05, "learning_rate": 1.3511705685618732e-06, "loss": 0.0, "step": 219200 }, { "epoch": 21.93, "grad_norm": 1.6851773807502468e-06, "learning_rate": 1.3494983277591975e-06, "loss": 0.0, "step": 219300 }, { "epoch": 21.94, "grad_norm": 4.769390216097236e-05, "learning_rate": 1.347826086956522e-06, "loss": 0.0, "step": 219400 }, { "epoch": 21.95, "grad_norm": 2.6172328944085166e-06, "learning_rate": 1.3461538461538462e-06, "loss": 0.0, "step": 219500 }, { "epoch": 21.96, "grad_norm": 2.637080115164281e-06, "learning_rate": 1.3444816053511706e-06, "loss": 0.0, "step": 219600 }, { "epoch": 21.97, "grad_norm": 2.824373495968757e-06, "learning_rate": 1.3428093645484953e-06, "loss": 0.0, "step": 219700 }, { "epoch": 21.98, "grad_norm": 3.1197271255223313e-06, "learning_rate": 1.3411371237458195e-06, "loss": 0.0, "step": 219800 }, { "epoch": 21.99, "grad_norm": 3.6116550745646236e-06, "learning_rate": 1.339464882943144e-06, "loss": 0.0, "step": 219900 }, { "epoch": 22.0, "grad_norm": 2.685248318812228e-06, "learning_rate": 1.3377926421404683e-06, "loss": 0.0, "step": 220000 }, { "epoch": 22.0, "eval_accuracy": 0.987325, "eval_f1": 0.987325, "eval_loss": 0.1481519639492035, "eval_runtime": 135.5655, "eval_samples_per_second": 295.06, "eval_steps_per_second": 295.06, "step": 220000 }, { "epoch": 22.01, "grad_norm": 6.039373602106934e-06, "learning_rate": 1.3361204013377927e-06, "loss": 0.0, "step": 220100 }, { "epoch": 22.02, "grad_norm": 4.358920341474004e-05, "learning_rate": 1.3344481605351172e-06, "loss": 0.0, "step": 220200 }, { "epoch": 22.03, "grad_norm": 2.7751359539252007e-06, "learning_rate": 1.3327759197324416e-06, "loss": 0.0, "step": 220300 }, { "epoch": 22.04, "grad_norm": 2.0891520762233995e-06, "learning_rate": 1.331103678929766e-06, "loss": 0.0, "step": 220400 }, { "epoch": 22.05, "grad_norm": 2.5228462163795484e-06, "learning_rate": 1.3294314381270903e-06, "loss": 0.0, "step": 220500 }, { "epoch": 22.06, "grad_norm": 0.00011139993148390204, "learning_rate": 1.3277591973244148e-06, "loss": 0.0053, "step": 220600 }, { "epoch": 22.07, "grad_norm": 6.763996680092532e-06, "learning_rate": 1.3260869565217393e-06, "loss": 0.0, "step": 220700 }, { "epoch": 22.08, "grad_norm": 0.0006400056299753487, "learning_rate": 1.3244147157190637e-06, "loss": 0.0, "step": 220800 }, { "epoch": 22.09, "grad_norm": 7.192987595772138e-06, "learning_rate": 1.322742474916388e-06, "loss": 0.0, "step": 220900 }, { "epoch": 22.1, "grad_norm": 1.966576974155032e-06, "learning_rate": 1.3210702341137124e-06, "loss": 0.0, "step": 221000 }, { "epoch": 22.11, "grad_norm": 8.61862918100087e-06, "learning_rate": 1.319397993311037e-06, "loss": 0.0, "step": 221100 }, { "epoch": 22.12, "grad_norm": 2.2725034796167165e-06, "learning_rate": 1.3177257525083614e-06, "loss": 0.0, "step": 221200 }, { "epoch": 22.13, "grad_norm": 0.0003781169361900538, "learning_rate": 1.3160535117056858e-06, "loss": 0.0, "step": 221300 }, { "epoch": 22.14, "grad_norm": 1.1108208127552643e-05, "learning_rate": 1.31438127090301e-06, "loss": 0.0, "step": 221400 }, { "epoch": 22.15, "grad_norm": 2.4818687052174937e-06, "learning_rate": 1.3127090301003345e-06, "loss": 0.0, "step": 221500 }, { "epoch": 22.16, "grad_norm": 2.578741487013758e-06, "learning_rate": 1.3110367892976588e-06, "loss": 0.0, "step": 221600 }, { "epoch": 22.17, "grad_norm": 1.6207087583097746e-06, "learning_rate": 1.3093645484949834e-06, "loss": 0.0, "step": 221700 }, { "epoch": 22.18, "grad_norm": 5.453099674923578e-06, "learning_rate": 1.307692307692308e-06, "loss": 0.0006, "step": 221800 }, { "epoch": 22.19, "grad_norm": 2.2827971406513825e-06, "learning_rate": 1.3060200668896322e-06, "loss": 0.0036, "step": 221900 }, { "epoch": 22.2, "grad_norm": 2.2465158053819323e-06, "learning_rate": 1.3043478260869566e-06, "loss": 0.0, "step": 222000 }, { "epoch": 22.21, "grad_norm": 2.7297505766910035e-06, "learning_rate": 1.3026755852842809e-06, "loss": 0.0, "step": 222100 }, { "epoch": 22.22, "grad_norm": 0.006246623583137989, "learning_rate": 1.3010033444816055e-06, "loss": 0.0, "step": 222200 }, { "epoch": 22.23, "grad_norm": 2.5728577384143136e-06, "learning_rate": 1.29933110367893e-06, "loss": 0.0, "step": 222300 }, { "epoch": 22.24, "grad_norm": 2.3373438580165384e-06, "learning_rate": 1.2976588628762542e-06, "loss": 0.0038, "step": 222400 }, { "epoch": 22.25, "grad_norm": 2.6464913389645517e-06, "learning_rate": 1.2959866220735787e-06, "loss": 0.0, "step": 222500 }, { "epoch": 22.26, "grad_norm": 5.262133709038608e-05, "learning_rate": 1.294314381270903e-06, "loss": 0.0, "step": 222600 }, { "epoch": 22.27, "grad_norm": 0.0036346414126455784, "learning_rate": 1.2926421404682276e-06, "loss": 0.0, "step": 222700 }, { "epoch": 22.28, "grad_norm": 1.1790768439823296e-05, "learning_rate": 1.290969899665552e-06, "loss": 0.0, "step": 222800 }, { "epoch": 22.29, "grad_norm": 1.9646349755930714e-06, "learning_rate": 1.2892976588628763e-06, "loss": 0.0, "step": 222900 }, { "epoch": 22.3, "grad_norm": 2.03367494577833e-06, "learning_rate": 1.2876254180602008e-06, "loss": 0.0, "step": 223000 }, { "epoch": 22.31, "grad_norm": 4.9398859118809924e-05, "learning_rate": 1.285953177257525e-06, "loss": 0.0, "step": 223100 }, { "epoch": 22.32, "grad_norm": 2.9453542538249167e-06, "learning_rate": 1.2842809364548495e-06, "loss": 0.0035, "step": 223200 }, { "epoch": 22.33, "grad_norm": 7.152600119297858e-06, "learning_rate": 1.2826086956521742e-06, "loss": 0.0, "step": 223300 }, { "epoch": 22.34, "grad_norm": 2.245857467642054e-05, "learning_rate": 1.2809364548494984e-06, "loss": 0.0, "step": 223400 }, { "epoch": 22.35, "grad_norm": 1.8094841607307899e-06, "learning_rate": 1.2792642140468229e-06, "loss": 0.0, "step": 223500 }, { "epoch": 22.36, "grad_norm": 2.0084596599190263e-06, "learning_rate": 1.2775919732441471e-06, "loss": 0.0, "step": 223600 }, { "epoch": 22.37, "grad_norm": 3.4688196137722116e-06, "learning_rate": 1.2759197324414716e-06, "loss": 0.0, "step": 223700 }, { "epoch": 22.38, "grad_norm": 123.31082916259766, "learning_rate": 1.2742474916387963e-06, "loss": 0.002, "step": 223800 }, { "epoch": 22.39, "grad_norm": 2.896066916946438e-06, "learning_rate": 1.2725752508361205e-06, "loss": 0.0043, "step": 223900 }, { "epoch": 22.4, "grad_norm": 0.04940846189856529, "learning_rate": 1.270903010033445e-06, "loss": 0.0, "step": 224000 }, { "epoch": 22.41, "grad_norm": 1.4823303899902385e-05, "learning_rate": 1.2692307692307692e-06, "loss": 0.0, "step": 224100 }, { "epoch": 22.42, "grad_norm": 2.5912493128998904e-06, "learning_rate": 1.2675585284280937e-06, "loss": 0.0, "step": 224200 }, { "epoch": 22.43, "grad_norm": 2.450393822073238e-06, "learning_rate": 1.2658862876254184e-06, "loss": 0.0, "step": 224300 }, { "epoch": 22.44, "grad_norm": 1.862587168943719e-06, "learning_rate": 1.2642140468227426e-06, "loss": 0.0, "step": 224400 }, { "epoch": 22.45, "grad_norm": 0.00016126820992212743, "learning_rate": 1.262541806020067e-06, "loss": 0.0, "step": 224500 }, { "epoch": 22.46, "grad_norm": 7.110831120371586e-06, "learning_rate": 1.2608695652173913e-06, "loss": 0.0, "step": 224600 }, { "epoch": 22.47, "grad_norm": 2.7295325253362535e-06, "learning_rate": 1.2591973244147158e-06, "loss": 0.0, "step": 224700 }, { "epoch": 22.48, "grad_norm": 1.2589208608915214e-06, "learning_rate": 1.2575250836120402e-06, "loss": 0.0033, "step": 224800 }, { "epoch": 22.49, "grad_norm": 5.134460934641538e-06, "learning_rate": 1.2558528428093647e-06, "loss": 0.0, "step": 224900 }, { "epoch": 22.5, "grad_norm": 3.236037809983827e-05, "learning_rate": 1.2541806020066892e-06, "loss": 0.0, "step": 225000 }, { "epoch": 22.5, "eval_accuracy": 0.98765, "eval_f1": 0.98765, "eval_loss": 0.14892929792404175, "eval_runtime": 136.0909, "eval_samples_per_second": 293.921, "eval_steps_per_second": 293.921, "step": 225000 }, { "epoch": 22.51, "grad_norm": 2.1141204342711717e-06, "learning_rate": 1.2525083612040134e-06, "loss": 0.0078, "step": 225100 }, { "epoch": 22.52, "grad_norm": 4.8540355237491895e-06, "learning_rate": 1.2508361204013379e-06, "loss": 0.0034, "step": 225200 }, { "epoch": 22.53, "grad_norm": 0.00014950527111068368, "learning_rate": 1.2491638795986623e-06, "loss": 0.0, "step": 225300 }, { "epoch": 22.54, "grad_norm": 2.4088958525680937e-06, "learning_rate": 1.2474916387959868e-06, "loss": 0.0, "step": 225400 }, { "epoch": 22.55, "grad_norm": 0.00022813983378000557, "learning_rate": 1.245819397993311e-06, "loss": 0.0, "step": 225500 }, { "epoch": 22.56, "grad_norm": 2.390265308349626e-06, "learning_rate": 1.2441471571906355e-06, "loss": 0.0, "step": 225600 }, { "epoch": 22.57, "grad_norm": 2.549254304540227e-06, "learning_rate": 1.24247491638796e-06, "loss": 0.0, "step": 225700 }, { "epoch": 22.58, "grad_norm": 6.035674687154824e-06, "learning_rate": 1.2408026755852844e-06, "loss": 0.0, "step": 225800 }, { "epoch": 22.59, "grad_norm": 1.8804727233145968e-06, "learning_rate": 1.2391304347826089e-06, "loss": 0.0, "step": 225900 }, { "epoch": 22.6, "grad_norm": 9.251437404600438e-06, "learning_rate": 1.2374581939799331e-06, "loss": 0.0, "step": 226000 }, { "epoch": 22.61, "grad_norm": 3.065869577767444e-06, "learning_rate": 1.2357859531772576e-06, "loss": 0.0, "step": 226100 }, { "epoch": 22.62, "grad_norm": 2.609942384879105e-06, "learning_rate": 1.234113712374582e-06, "loss": 0.0, "step": 226200 }, { "epoch": 22.63, "grad_norm": 1.9762251213251147e-06, "learning_rate": 1.2324414715719065e-06, "loss": 0.0, "step": 226300 }, { "epoch": 22.64, "grad_norm": 0.0001107776042772457, "learning_rate": 1.230769230769231e-06, "loss": 0.0, "step": 226400 }, { "epoch": 22.65, "grad_norm": 1.7592420817891252e-06, "learning_rate": 1.2290969899665552e-06, "loss": 0.0, "step": 226500 }, { "epoch": 22.66, "grad_norm": 4.1662101466499735e-06, "learning_rate": 1.2274247491638797e-06, "loss": 0.0, "step": 226600 }, { "epoch": 22.67, "grad_norm": 4.5451852201949805e-06, "learning_rate": 1.2257525083612041e-06, "loss": 0.0, "step": 226700 }, { "epoch": 22.68, "grad_norm": 6.946735811652616e-05, "learning_rate": 1.2240802675585286e-06, "loss": 0.0, "step": 226800 }, { "epoch": 22.69, "grad_norm": 1.3782610039925203e-06, "learning_rate": 1.2224080267558529e-06, "loss": 0.006, "step": 226900 }, { "epoch": 22.7, "grad_norm": 2.4639061848574784e-06, "learning_rate": 1.2207357859531773e-06, "loss": 0.0104, "step": 227000 }, { "epoch": 22.71, "grad_norm": 2.496316710676183e-06, "learning_rate": 1.2190635451505018e-06, "loss": 0.0, "step": 227100 }, { "epoch": 22.72, "grad_norm": 2.4078549358819146e-06, "learning_rate": 1.2173913043478262e-06, "loss": 0.0007, "step": 227200 }, { "epoch": 22.73, "grad_norm": 3.610181011026725e-06, "learning_rate": 1.2157190635451507e-06, "loss": 0.0047, "step": 227300 }, { "epoch": 22.74, "grad_norm": 2.6597438136377605e-06, "learning_rate": 1.214046822742475e-06, "loss": 0.0, "step": 227400 }, { "epoch": 22.75, "grad_norm": 2.876392727557686e-06, "learning_rate": 1.2123745819397994e-06, "loss": 0.0, "step": 227500 }, { "epoch": 22.76, "grad_norm": 6.276714429986896e-06, "learning_rate": 1.2107023411371239e-06, "loss": 0.0, "step": 227600 }, { "epoch": 22.77, "grad_norm": 1.3213903912401292e-05, "learning_rate": 1.2090301003344483e-06, "loss": 0.0, "step": 227700 }, { "epoch": 22.78, "grad_norm": 0.0025315505918115377, "learning_rate": 1.2073578595317726e-06, "loss": 0.0048, "step": 227800 }, { "epoch": 22.79, "grad_norm": 1.0337781532143708e-05, "learning_rate": 1.205685618729097e-06, "loss": 0.0, "step": 227900 }, { "epoch": 22.8, "grad_norm": 2.1591881704807747e-06, "learning_rate": 1.2040133779264215e-06, "loss": 0.0, "step": 228000 }, { "epoch": 22.81, "grad_norm": 4.338608050602488e-06, "learning_rate": 1.202341137123746e-06, "loss": 0.0, "step": 228100 }, { "epoch": 22.82, "grad_norm": 1.976111889234744e-06, "learning_rate": 1.2006688963210704e-06, "loss": 0.0, "step": 228200 }, { "epoch": 22.83, "grad_norm": 1.1604581231949851e-05, "learning_rate": 1.1989966555183947e-06, "loss": 0.0051, "step": 228300 }, { "epoch": 22.84, "grad_norm": 3.1730382943351287e-06, "learning_rate": 1.1973244147157191e-06, "loss": 0.0, "step": 228400 }, { "epoch": 22.85, "grad_norm": 2.0415036487975158e-06, "learning_rate": 1.1956521739130436e-06, "loss": 0.0006, "step": 228500 }, { "epoch": 22.86, "grad_norm": 7.62998479331145e-06, "learning_rate": 1.193979933110368e-06, "loss": 0.0, "step": 228600 }, { "epoch": 22.87, "grad_norm": 2.8338245101622306e-06, "learning_rate": 1.1923076923076925e-06, "loss": 0.0, "step": 228700 }, { "epoch": 22.88, "grad_norm": 6.511565970868105e-06, "learning_rate": 1.1906354515050168e-06, "loss": 0.0, "step": 228800 }, { "epoch": 22.89, "grad_norm": 1.0614480743242893e-05, "learning_rate": 1.1889632107023412e-06, "loss": 0.0, "step": 228900 }, { "epoch": 22.9, "grad_norm": 8.845283446135e-06, "learning_rate": 1.1872909698996657e-06, "loss": 0.0031, "step": 229000 }, { "epoch": 22.91, "grad_norm": 8.616744162281975e-05, "learning_rate": 1.1856187290969901e-06, "loss": 0.0, "step": 229100 }, { "epoch": 22.92, "grad_norm": 1.3663597201230004e-05, "learning_rate": 1.1839464882943144e-06, "loss": 0.0, "step": 229200 }, { "epoch": 22.93, "grad_norm": 5.31476234755246e-06, "learning_rate": 1.1822742474916388e-06, "loss": 0.0, "step": 229300 }, { "epoch": 22.94, "grad_norm": 1.5794164937688038e-05, "learning_rate": 1.1806020066889633e-06, "loss": 0.0, "step": 229400 }, { "epoch": 22.95, "grad_norm": 3.345578534208471e-06, "learning_rate": 1.1789297658862878e-06, "loss": 0.0, "step": 229500 }, { "epoch": 22.96, "grad_norm": 1.899161907203961e-05, "learning_rate": 1.1772575250836122e-06, "loss": 0.0, "step": 229600 }, { "epoch": 22.97, "grad_norm": 1.202254952659132e-05, "learning_rate": 1.1755852842809365e-06, "loss": 0.0, "step": 229700 }, { "epoch": 22.98, "grad_norm": 3.4707027225522324e-05, "learning_rate": 1.173913043478261e-06, "loss": 0.0, "step": 229800 }, { "epoch": 22.99, "grad_norm": 2.837147349055158e-06, "learning_rate": 1.1722408026755854e-06, "loss": 0.0, "step": 229900 }, { "epoch": 23.0, "grad_norm": 1.9231772512284806e-06, "learning_rate": 1.1705685618729099e-06, "loss": 0.0, "step": 230000 }, { "epoch": 23.0, "eval_accuracy": 0.988, "eval_f1": 0.988, "eval_loss": 0.13970816135406494, "eval_runtime": 138.5084, "eval_samples_per_second": 288.791, "eval_steps_per_second": 288.791, "step": 230000 }, { "epoch": 23.01, "grad_norm": 0.00043960308539681137, "learning_rate": 1.168896321070234e-06, "loss": 0.0, "step": 230100 }, { "epoch": 23.02, "grad_norm": 5.0748644753184635e-06, "learning_rate": 1.1672240802675586e-06, "loss": 0.0, "step": 230200 }, { "epoch": 23.03, "grad_norm": 6.698410288663581e-05, "learning_rate": 1.165551839464883e-06, "loss": 0.0, "step": 230300 }, { "epoch": 23.04, "grad_norm": 1.7563420442456845e-06, "learning_rate": 1.1638795986622075e-06, "loss": 0.0, "step": 230400 }, { "epoch": 23.05, "grad_norm": 2.1650689632224385e-06, "learning_rate": 1.162207357859532e-06, "loss": 0.0, "step": 230500 }, { "epoch": 23.06, "grad_norm": 2.9938364605186507e-06, "learning_rate": 1.1605351170568562e-06, "loss": 0.0, "step": 230600 }, { "epoch": 23.07, "grad_norm": 2.1382991235441295e-06, "learning_rate": 1.1588628762541807e-06, "loss": 0.0011, "step": 230700 }, { "epoch": 23.08, "grad_norm": 4.4728392822435126e-05, "learning_rate": 1.1571906354515051e-06, "loss": 0.0, "step": 230800 }, { "epoch": 23.09, "grad_norm": 1.2432237781467848e-05, "learning_rate": 1.1555183946488296e-06, "loss": 0.0, "step": 230900 }, { "epoch": 23.1, "grad_norm": 1.834149588830769e-05, "learning_rate": 1.153846153846154e-06, "loss": 0.0, "step": 231000 }, { "epoch": 23.11, "grad_norm": 7.334803740377538e-06, "learning_rate": 1.1521739130434783e-06, "loss": 0.0, "step": 231100 }, { "epoch": 23.12, "grad_norm": 3.3072183214244433e-06, "learning_rate": 1.1505016722408027e-06, "loss": 0.0001, "step": 231200 }, { "epoch": 23.13, "grad_norm": 0.0029784536454826593, "learning_rate": 1.1488294314381272e-06, "loss": 0.0049, "step": 231300 }, { "epoch": 23.14, "grad_norm": 0.00029570041806437075, "learning_rate": 1.1471571906354517e-06, "loss": 0.0, "step": 231400 }, { "epoch": 23.15, "grad_norm": 0.00011625168554019183, "learning_rate": 1.145484949832776e-06, "loss": 0.0, "step": 231500 }, { "epoch": 23.16, "grad_norm": 0.0046653710305690765, "learning_rate": 1.1438127090301004e-06, "loss": 0.0, "step": 231600 }, { "epoch": 23.17, "grad_norm": 6.324274636426708e-06, "learning_rate": 1.1421404682274248e-06, "loss": 0.0, "step": 231700 }, { "epoch": 23.18, "grad_norm": 5.813838924950687e-06, "learning_rate": 1.1404682274247493e-06, "loss": 0.0, "step": 231800 }, { "epoch": 23.19, "grad_norm": 1.855208552115073e-06, "learning_rate": 1.1387959866220738e-06, "loss": 0.0015, "step": 231900 }, { "epoch": 23.2, "grad_norm": 1.5933362647047034e-06, "learning_rate": 1.137123745819398e-06, "loss": 0.0, "step": 232000 }, { "epoch": 23.21, "grad_norm": 2.5436947908019647e-06, "learning_rate": 1.1354515050167225e-06, "loss": 0.0, "step": 232100 }, { "epoch": 23.22, "grad_norm": 1.4847254306005198e-06, "learning_rate": 1.133779264214047e-06, "loss": 0.0, "step": 232200 }, { "epoch": 23.23, "grad_norm": 2.0350166778371204e-06, "learning_rate": 1.1321070234113714e-06, "loss": 0.0, "step": 232300 }, { "epoch": 23.24, "grad_norm": 2.920028009611997e-06, "learning_rate": 1.1304347826086956e-06, "loss": 0.0054, "step": 232400 }, { "epoch": 23.25, "grad_norm": 2.4189600935642375e-06, "learning_rate": 1.12876254180602e-06, "loss": 0.0, "step": 232500 }, { "epoch": 23.26, "grad_norm": 4.0135386370820925e-05, "learning_rate": 1.1270903010033446e-06, "loss": 0.0, "step": 232600 }, { "epoch": 23.27, "grad_norm": 8.483904821332544e-06, "learning_rate": 1.125418060200669e-06, "loss": 0.0, "step": 232700 }, { "epoch": 23.28, "grad_norm": 1.1192724741704296e-05, "learning_rate": 1.1237458193979935e-06, "loss": 0.0, "step": 232800 }, { "epoch": 23.29, "grad_norm": 3.040320734726265e-06, "learning_rate": 1.1220735785953177e-06, "loss": 0.0, "step": 232900 }, { "epoch": 23.3, "grad_norm": 2.234099838460679e-06, "learning_rate": 1.1204013377926422e-06, "loss": 0.0, "step": 233000 }, { "epoch": 23.31, "grad_norm": 1.7016451465678983e-06, "learning_rate": 1.1187290969899666e-06, "loss": 0.0, "step": 233100 }, { "epoch": 23.32, "grad_norm": 2.1192927306401543e-06, "learning_rate": 1.1170568561872911e-06, "loss": 0.0013, "step": 233200 }, { "epoch": 23.33, "grad_norm": 3.3578889997443184e-06, "learning_rate": 1.1153846153846156e-06, "loss": 0.0, "step": 233300 }, { "epoch": 23.34, "grad_norm": 3.584066689654719e-06, "learning_rate": 1.1137123745819398e-06, "loss": 0.0, "step": 233400 }, { "epoch": 23.35, "grad_norm": 1.8127288967662025e-06, "learning_rate": 1.1120401337792643e-06, "loss": 0.0, "step": 233500 }, { "epoch": 23.36, "grad_norm": 1.8148879235013737e-06, "learning_rate": 1.1103678929765887e-06, "loss": 0.0, "step": 233600 }, { "epoch": 23.37, "grad_norm": 1.7151642168755643e-05, "learning_rate": 1.1086956521739132e-06, "loss": 0.0, "step": 233700 }, { "epoch": 23.38, "grad_norm": 2.9488589916581986e-06, "learning_rate": 1.1070234113712377e-06, "loss": 0.0, "step": 233800 }, { "epoch": 23.39, "grad_norm": 3.188793925801292e-06, "learning_rate": 1.105351170568562e-06, "loss": 0.0, "step": 233900 }, { "epoch": 23.4, "grad_norm": 3.0367280487553217e-05, "learning_rate": 1.1036789297658864e-06, "loss": 0.0, "step": 234000 }, { "epoch": 23.41, "grad_norm": 0.0007931198342703283, "learning_rate": 1.1020066889632108e-06, "loss": 0.0, "step": 234100 }, { "epoch": 23.42, "grad_norm": 0.0004764864861499518, "learning_rate": 1.1003344481605353e-06, "loss": 0.0, "step": 234200 }, { "epoch": 23.43, "grad_norm": 2.6186989998677745e-06, "learning_rate": 1.0986622073578595e-06, "loss": 0.0068, "step": 234300 }, { "epoch": 23.44, "grad_norm": 1.6648232303850818e-06, "learning_rate": 1.096989966555184e-06, "loss": 0.0066, "step": 234400 }, { "epoch": 23.45, "grad_norm": 2.701480980249471e-06, "learning_rate": 1.0953177257525085e-06, "loss": 0.0, "step": 234500 }, { "epoch": 23.46, "grad_norm": 2.0431327811820665e-06, "learning_rate": 1.093645484949833e-06, "loss": 0.0, "step": 234600 }, { "epoch": 23.47, "grad_norm": 4.373225237941369e-06, "learning_rate": 1.0919732441471572e-06, "loss": 0.0, "step": 234700 }, { "epoch": 23.48, "grad_norm": 1.955356310645584e-06, "learning_rate": 1.0903010033444816e-06, "loss": 0.0, "step": 234800 }, { "epoch": 23.49, "grad_norm": 3.090561449425877e-06, "learning_rate": 1.088628762541806e-06, "loss": 0.0, "step": 234900 }, { "epoch": 23.5, "grad_norm": 2.478899114066735e-05, "learning_rate": 1.0869565217391306e-06, "loss": 0.0, "step": 235000 }, { "epoch": 23.5, "eval_accuracy": 0.9877, "eval_f1": 0.9877, "eval_loss": 0.14368464052677155, "eval_runtime": 137.2609, "eval_samples_per_second": 291.416, "eval_steps_per_second": 291.416, "step": 235000 }, { "epoch": 23.51, "grad_norm": 2.0134564238105668e-06, "learning_rate": 1.085284280936455e-06, "loss": 0.0, "step": 235100 }, { "epoch": 23.52, "grad_norm": 0.00012607380631379783, "learning_rate": 1.0836120401337793e-06, "loss": 0.0, "step": 235200 }, { "epoch": 23.53, "grad_norm": 3.1202098398352973e-06, "learning_rate": 1.0819397993311037e-06, "loss": 0.0, "step": 235300 }, { "epoch": 23.54, "grad_norm": 3.076041821259423e-06, "learning_rate": 1.0802675585284282e-06, "loss": 0.0, "step": 235400 }, { "epoch": 23.55, "grad_norm": 2.8738932087435387e-06, "learning_rate": 1.0785953177257526e-06, "loss": 0.0036, "step": 235500 }, { "epoch": 23.56, "grad_norm": 1.94904441741528e-06, "learning_rate": 1.076923076923077e-06, "loss": 0.0, "step": 235600 }, { "epoch": 23.57, "grad_norm": 8.461877769150306e-06, "learning_rate": 1.0752508361204014e-06, "loss": 0.0, "step": 235700 }, { "epoch": 23.58, "grad_norm": 8.348670235136524e-06, "learning_rate": 1.0735785953177258e-06, "loss": 0.0, "step": 235800 }, { "epoch": 23.59, "grad_norm": 3.347354549987358e-06, "learning_rate": 1.0719063545150503e-06, "loss": 0.0, "step": 235900 }, { "epoch": 23.6, "grad_norm": 2.0461826352402568e-05, "learning_rate": 1.0702341137123747e-06, "loss": 0.0, "step": 236000 }, { "epoch": 23.61, "grad_norm": 4.463665391085669e-06, "learning_rate": 1.0685618729096992e-06, "loss": 0.0, "step": 236100 }, { "epoch": 23.62, "grad_norm": 2.9481900583050447e-06, "learning_rate": 1.0668896321070234e-06, "loss": 0.0, "step": 236200 }, { "epoch": 23.63, "grad_norm": 1.6733443771954626e-05, "learning_rate": 1.065217391304348e-06, "loss": 0.0, "step": 236300 }, { "epoch": 23.64, "grad_norm": 2.97983660857426e-06, "learning_rate": 1.0635451505016724e-06, "loss": 0.0, "step": 236400 }, { "epoch": 23.65, "grad_norm": 4.343262844486162e-06, "learning_rate": 1.0618729096989968e-06, "loss": 0.0, "step": 236500 }, { "epoch": 23.66, "grad_norm": 1.3636974927067058e-06, "learning_rate": 1.060200668896321e-06, "loss": 0.0005, "step": 236600 }, { "epoch": 23.67, "grad_norm": 1.4470730320681469e-06, "learning_rate": 1.0585284280936455e-06, "loss": 0.0, "step": 236700 }, { "epoch": 23.68, "grad_norm": 0.0001465627137804404, "learning_rate": 1.05685618729097e-06, "loss": 0.0, "step": 236800 }, { "epoch": 23.69, "grad_norm": 5.631826934404671e-05, "learning_rate": 1.0551839464882945e-06, "loss": 0.0, "step": 236900 }, { "epoch": 23.7, "grad_norm": 5.150936885911506e-06, "learning_rate": 1.0535117056856187e-06, "loss": 0.0, "step": 237000 }, { "epoch": 23.71, "grad_norm": 1.5397962442875723e-06, "learning_rate": 1.0518394648829432e-06, "loss": 0.0, "step": 237100 }, { "epoch": 23.72, "grad_norm": 1.8038625739791314e-06, "learning_rate": 1.0501672240802676e-06, "loss": 0.0031, "step": 237200 }, { "epoch": 23.73, "grad_norm": 1.82100211532088e-05, "learning_rate": 1.048494983277592e-06, "loss": 0.0, "step": 237300 }, { "epoch": 23.74, "grad_norm": 1.540220296192274e-06, "learning_rate": 1.0468227424749165e-06, "loss": 0.0, "step": 237400 }, { "epoch": 23.75, "grad_norm": 3.610687826949288e-06, "learning_rate": 1.0451505016722408e-06, "loss": 0.0, "step": 237500 }, { "epoch": 23.76, "grad_norm": 2.127660991391167e-06, "learning_rate": 1.0434782608695653e-06, "loss": 0.0, "step": 237600 }, { "epoch": 23.77, "grad_norm": 1.2974899163964437e-06, "learning_rate": 1.0418060200668897e-06, "loss": 0.0009, "step": 237700 }, { "epoch": 23.78, "grad_norm": 2.5898298190440983e-06, "learning_rate": 1.0401337792642142e-06, "loss": 0.0, "step": 237800 }, { "epoch": 23.79, "grad_norm": 3.946627657569479e-06, "learning_rate": 1.0384615384615386e-06, "loss": 0.0, "step": 237900 }, { "epoch": 23.8, "grad_norm": 8.825607801554725e-05, "learning_rate": 1.0367892976588629e-06, "loss": 0.0, "step": 238000 }, { "epoch": 23.81, "grad_norm": 1.747809960761515e-06, "learning_rate": 1.0351170568561873e-06, "loss": 0.0, "step": 238100 }, { "epoch": 23.82, "grad_norm": 7.072359949233942e-06, "learning_rate": 1.0334448160535118e-06, "loss": 0.0004, "step": 238200 }, { "epoch": 23.83, "grad_norm": 4.897616690868745e-06, "learning_rate": 1.0317725752508363e-06, "loss": 0.0, "step": 238300 }, { "epoch": 23.84, "grad_norm": 2.8052606921846746e-06, "learning_rate": 1.0301003344481607e-06, "loss": 0.0, "step": 238400 }, { "epoch": 23.85, "grad_norm": 1.638777007428871e-06, "learning_rate": 1.028428093645485e-06, "loss": 0.0, "step": 238500 }, { "epoch": 23.86, "grad_norm": 3.3246594739466673e-06, "learning_rate": 1.0267558528428094e-06, "loss": 0.0, "step": 238600 }, { "epoch": 23.87, "grad_norm": 8.469135536870454e-06, "learning_rate": 1.025083612040134e-06, "loss": 0.0, "step": 238700 }, { "epoch": 23.88, "grad_norm": 1.0242590633424697e-06, "learning_rate": 1.0234113712374584e-06, "loss": 0.0, "step": 238800 }, { "epoch": 23.89, "grad_norm": 2.301003405591473e-06, "learning_rate": 1.0217391304347828e-06, "loss": 0.0, "step": 238900 }, { "epoch": 23.9, "grad_norm": 1.0797813274621149e-06, "learning_rate": 1.020066889632107e-06, "loss": 0.0, "step": 239000 }, { "epoch": 23.91, "grad_norm": 9.672876331023872e-06, "learning_rate": 1.0183946488294315e-06, "loss": 0.0, "step": 239100 }, { "epoch": 23.92, "grad_norm": 2.800751190079609e-06, "learning_rate": 1.016722408026756e-06, "loss": 0.0, "step": 239200 }, { "epoch": 23.93, "grad_norm": 9.375202694172913e-07, "learning_rate": 1.0150501672240804e-06, "loss": 0.0, "step": 239300 }, { "epoch": 23.94, "grad_norm": 0.005259317811578512, "learning_rate": 1.0133779264214047e-06, "loss": 0.0, "step": 239400 }, { "epoch": 23.95, "grad_norm": 4.873456418863498e-06, "learning_rate": 1.0117056856187292e-06, "loss": 0.0, "step": 239500 }, { "epoch": 23.96, "grad_norm": 1.0279156867909478e-06, "learning_rate": 1.0100334448160536e-06, "loss": 0.0, "step": 239600 }, { "epoch": 23.97, "grad_norm": 1.5694160993007245e-06, "learning_rate": 1.008361204013378e-06, "loss": 0.0, "step": 239700 }, { "epoch": 23.98, "grad_norm": 5.572123882302549e-06, "learning_rate": 1.0066889632107023e-06, "loss": 0.0051, "step": 239800 }, { "epoch": 23.99, "grad_norm": 8.108318979793694e-06, "learning_rate": 1.0050167224080268e-06, "loss": 0.0037, "step": 239900 }, { "epoch": 24.0, "grad_norm": 1.9160975170962047e-06, "learning_rate": 1.0033444816053512e-06, "loss": 0.0, "step": 240000 }, { "epoch": 24.0, "eval_accuracy": 0.9875, "eval_f1": 0.9875, "eval_loss": 0.15013839304447174, "eval_runtime": 135.7089, "eval_samples_per_second": 294.749, "eval_steps_per_second": 294.749, "step": 240000 }, { "epoch": 24.01, "grad_norm": 2.217173687313334e-06, "learning_rate": 1.0016722408026757e-06, "loss": 0.0, "step": 240100 }, { "epoch": 24.02, "grad_norm": 8.573334343964234e-05, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "step": 240200 }, { "epoch": 24.03, "grad_norm": 2.2610554424318252e-06, "learning_rate": 9.983277591973244e-07, "loss": 0.0, "step": 240300 }, { "epoch": 24.04, "grad_norm": 3.525610736687668e-05, "learning_rate": 9.966555183946489e-07, "loss": 0.002, "step": 240400 }, { "epoch": 24.05, "grad_norm": 1.1360249345671036e-06, "learning_rate": 9.949832775919733e-07, "loss": 0.0, "step": 240500 }, { "epoch": 24.06, "grad_norm": 1.7239157159565366e-06, "learning_rate": 9.933110367892978e-07, "loss": 0.0028, "step": 240600 }, { "epoch": 24.07, "grad_norm": 3.1304966796597e-06, "learning_rate": 9.916387959866223e-07, "loss": 0.0074, "step": 240700 }, { "epoch": 24.08, "grad_norm": 1.5380784361695987e-06, "learning_rate": 9.899665551839465e-07, "loss": 0.001, "step": 240800 }, { "epoch": 24.09, "grad_norm": 5.637221875076648e-06, "learning_rate": 9.88294314381271e-07, "loss": 0.0, "step": 240900 }, { "epoch": 24.1, "grad_norm": 1.9916706150979735e-06, "learning_rate": 9.866220735785954e-07, "loss": 0.0, "step": 241000 }, { "epoch": 24.11, "grad_norm": 1.8978020079885027e-06, "learning_rate": 9.849498327759199e-07, "loss": 0.0, "step": 241100 }, { "epoch": 24.12, "grad_norm": 2.0716247490781825e-06, "learning_rate": 9.832775919732443e-07, "loss": 0.0, "step": 241200 }, { "epoch": 24.13, "grad_norm": 1.9153335415467154e-06, "learning_rate": 9.816053511705686e-07, "loss": 0.0, "step": 241300 }, { "epoch": 24.14, "grad_norm": 1.0115123814102844e-06, "learning_rate": 9.79933110367893e-07, "loss": 0.0, "step": 241400 }, { "epoch": 24.15, "grad_norm": 1.018683747133764e-06, "learning_rate": 9.782608695652175e-07, "loss": 0.0018, "step": 241500 }, { "epoch": 24.16, "grad_norm": 2.7036819574277615e-06, "learning_rate": 9.76588628762542e-07, "loss": 0.0, "step": 241600 }, { "epoch": 24.17, "grad_norm": 6.6156862885691226e-06, "learning_rate": 9.749163879598662e-07, "loss": 0.0, "step": 241700 }, { "epoch": 24.18, "grad_norm": 2.5971964987547835e-06, "learning_rate": 9.732441471571907e-07, "loss": 0.0, "step": 241800 }, { "epoch": 24.19, "grad_norm": 1.5181810795183992e-06, "learning_rate": 9.715719063545151e-07, "loss": 0.0, "step": 241900 }, { "epoch": 24.2, "grad_norm": 1.7683860278339125e-06, "learning_rate": 9.698996655518396e-07, "loss": 0.0, "step": 242000 }, { "epoch": 24.21, "grad_norm": 3.6412154713616474e-06, "learning_rate": 9.682274247491639e-07, "loss": 0.0004, "step": 242100 }, { "epoch": 24.22, "grad_norm": 1.359268935630098e-06, "learning_rate": 9.665551839464883e-07, "loss": 0.0, "step": 242200 }, { "epoch": 24.23, "grad_norm": 3.5012128591915825e-06, "learning_rate": 9.648829431438128e-07, "loss": 0.0, "step": 242300 }, { "epoch": 24.24, "grad_norm": 3.7246281863190234e-06, "learning_rate": 9.632107023411372e-07, "loss": 0.0, "step": 242400 }, { "epoch": 24.25, "grad_norm": 5.6625767683726735e-06, "learning_rate": 9.615384615384617e-07, "loss": 0.0, "step": 242500 }, { "epoch": 24.26, "grad_norm": 1.992144916584948e-06, "learning_rate": 9.59866220735786e-07, "loss": 0.0, "step": 242600 }, { "epoch": 24.27, "grad_norm": 2.249276576549164e-06, "learning_rate": 9.581939799331104e-07, "loss": 0.0062, "step": 242700 }, { "epoch": 24.28, "grad_norm": 1.3135835388311534e-06, "learning_rate": 9.565217391304349e-07, "loss": 0.0, "step": 242800 }, { "epoch": 24.29, "grad_norm": 1.8054714701065677e-06, "learning_rate": 9.548494983277593e-07, "loss": 0.0, "step": 242900 }, { "epoch": 24.3, "grad_norm": 3.5502798709785566e-06, "learning_rate": 9.531772575250838e-07, "loss": 0.0, "step": 243000 }, { "epoch": 24.31, "grad_norm": 2.3826137294236105e-06, "learning_rate": 9.515050167224081e-07, "loss": 0.0, "step": 243100 }, { "epoch": 24.32, "grad_norm": 1.0196755283686798e-05, "learning_rate": 9.498327759197325e-07, "loss": 0.0, "step": 243200 }, { "epoch": 24.33, "grad_norm": 1.386860390084621e-06, "learning_rate": 9.48160535117057e-07, "loss": 0.0, "step": 243300 }, { "epoch": 24.34, "grad_norm": 1.7781057977117598e-06, "learning_rate": 9.464882943143813e-07, "loss": 0.0, "step": 243400 }, { "epoch": 24.35, "grad_norm": 1.5628019127689186e-06, "learning_rate": 9.448160535117059e-07, "loss": 0.0, "step": 243500 }, { "epoch": 24.36, "grad_norm": 9.654933819547296e-06, "learning_rate": 9.431438127090302e-07, "loss": 0.0, "step": 243600 }, { "epoch": 24.37, "grad_norm": 2.1423722955660196e-06, "learning_rate": 9.414715719063546e-07, "loss": 0.0, "step": 243700 }, { "epoch": 24.38, "grad_norm": 2.166335207220982e-06, "learning_rate": 9.39799331103679e-07, "loss": 0.0, "step": 243800 }, { "epoch": 24.39, "grad_norm": 1.555784479023714e-06, "learning_rate": 9.381270903010034e-07, "loss": 0.0011, "step": 243900 }, { "epoch": 24.4, "grad_norm": 2.7639248401101213e-06, "learning_rate": 9.364548494983278e-07, "loss": 0.0056, "step": 244000 }, { "epoch": 24.41, "grad_norm": 1.255690335710824e-06, "learning_rate": 9.347826086956522e-07, "loss": 0.0, "step": 244100 }, { "epoch": 24.42, "grad_norm": 1.6317754898409476e-06, "learning_rate": 9.331103678929766e-07, "loss": 0.0, "step": 244200 }, { "epoch": 24.43, "grad_norm": 5.905946818529628e-05, "learning_rate": 9.314381270903011e-07, "loss": 0.0, "step": 244300 }, { "epoch": 24.44, "grad_norm": 6.9295992943807505e-06, "learning_rate": 9.297658862876255e-07, "loss": 0.0, "step": 244400 }, { "epoch": 24.45, "grad_norm": 6.23083451500861e-06, "learning_rate": 9.280936454849498e-07, "loss": 0.009, "step": 244500 }, { "epoch": 24.46, "grad_norm": 2.21546838474751e-06, "learning_rate": 9.264214046822743e-07, "loss": 0.0, "step": 244600 }, { "epoch": 24.47, "grad_norm": 1.5623327271896414e-06, "learning_rate": 9.247491638795987e-07, "loss": 0.0, "step": 244700 }, { "epoch": 24.48, "grad_norm": 1.841994708229322e-05, "learning_rate": 9.230769230769232e-07, "loss": 0.0, "step": 244800 }, { "epoch": 24.49, "grad_norm": 1.967902562682866e-06, "learning_rate": 9.214046822742476e-07, "loss": 0.0, "step": 244900 }, { "epoch": 24.5, "grad_norm": 7.149414159357548e-05, "learning_rate": 9.197324414715719e-07, "loss": 0.0, "step": 245000 }, { "epoch": 24.5, "eval_accuracy": 0.987675, "eval_f1": 0.987675, "eval_loss": 0.14463376998901367, "eval_runtime": 133.7499, "eval_samples_per_second": 299.066, "eval_steps_per_second": 299.066, "step": 245000 }, { "epoch": 24.51, "grad_norm": 2.040028903138591e-06, "learning_rate": 9.180602006688964e-07, "loss": 0.0, "step": 245100 }, { "epoch": 24.52, "grad_norm": 8.409876500081737e-06, "learning_rate": 9.163879598662208e-07, "loss": 0.0, "step": 245200 }, { "epoch": 24.53, "grad_norm": 0.0001772494870238006, "learning_rate": 9.147157190635453e-07, "loss": 0.0, "step": 245300 }, { "epoch": 24.54, "grad_norm": 1.979961780307349e-06, "learning_rate": 9.130434782608697e-07, "loss": 0.0, "step": 245400 }, { "epoch": 24.55, "grad_norm": 1.6618955669400748e-06, "learning_rate": 9.11371237458194e-07, "loss": 0.0, "step": 245500 }, { "epoch": 24.56, "grad_norm": 9.364392212773964e-07, "learning_rate": 9.096989966555185e-07, "loss": 0.0, "step": 245600 }, { "epoch": 24.57, "grad_norm": 1.2160958249296527e-06, "learning_rate": 9.080267558528428e-07, "loss": 0.0007, "step": 245700 }, { "epoch": 24.58, "grad_norm": 7.665908015042078e-06, "learning_rate": 9.063545150501674e-07, "loss": 0.0, "step": 245800 }, { "epoch": 24.59, "grad_norm": 8.277664892375469e-05, "learning_rate": 9.046822742474918e-07, "loss": 0.0, "step": 245900 }, { "epoch": 24.6, "grad_norm": 3.3383425943611655e-06, "learning_rate": 9.030100334448161e-07, "loss": 0.0, "step": 246000 }, { "epoch": 24.61, "grad_norm": 1.6881747342267772e-06, "learning_rate": 9.013377926421406e-07, "loss": 0.0, "step": 246100 }, { "epoch": 24.62, "grad_norm": 1.5223195077851415e-06, "learning_rate": 8.996655518394649e-07, "loss": 0.0, "step": 246200 }, { "epoch": 24.63, "grad_norm": 1.902065378089901e-06, "learning_rate": 8.979933110367894e-07, "loss": 0.0047, "step": 246300 }, { "epoch": 24.64, "grad_norm": 2.0098498225706862e-06, "learning_rate": 8.963210702341138e-07, "loss": 0.0, "step": 246400 }, { "epoch": 24.65, "grad_norm": 1.1443703442637343e-05, "learning_rate": 8.946488294314382e-07, "loss": 0.0, "step": 246500 }, { "epoch": 24.66, "grad_norm": 4.432857167557813e-06, "learning_rate": 8.929765886287627e-07, "loss": 0.0, "step": 246600 }, { "epoch": 24.67, "grad_norm": 1.3005646906094626e-06, "learning_rate": 8.91304347826087e-07, "loss": 0.0, "step": 246700 }, { "epoch": 24.68, "grad_norm": 2.724513251450844e-06, "learning_rate": 8.896321070234114e-07, "loss": 0.0, "step": 246800 }, { "epoch": 24.69, "grad_norm": 1.3002398191019893e-05, "learning_rate": 8.879598662207358e-07, "loss": 0.0, "step": 246900 }, { "epoch": 24.7, "grad_norm": 3.660285074147396e-06, "learning_rate": 8.862876254180602e-07, "loss": 0.0, "step": 247000 }, { "epoch": 24.71, "grad_norm": 1.4119721527094953e-05, "learning_rate": 8.846153846153848e-07, "loss": 0.0, "step": 247100 }, { "epoch": 24.72, "grad_norm": 1.8244367083752877e-06, "learning_rate": 8.829431438127091e-07, "loss": 0.0, "step": 247200 }, { "epoch": 24.73, "grad_norm": 2.2953408915782347e-06, "learning_rate": 8.812709030100335e-07, "loss": 0.0, "step": 247300 }, { "epoch": 24.74, "grad_norm": 1.3426415534922853e-06, "learning_rate": 8.795986622073579e-07, "loss": 0.0045, "step": 247400 }, { "epoch": 24.75, "grad_norm": 2.1255984393064864e-05, "learning_rate": 8.779264214046823e-07, "loss": 0.0, "step": 247500 }, { "epoch": 24.76, "grad_norm": 1.6822939414851135e-06, "learning_rate": 8.762541806020069e-07, "loss": 0.0, "step": 247600 }, { "epoch": 24.77, "grad_norm": 2.1174896573938895e-06, "learning_rate": 8.745819397993312e-07, "loss": 0.0, "step": 247700 }, { "epoch": 24.78, "grad_norm": 1.8592134438222274e-05, "learning_rate": 8.729096989966556e-07, "loss": 0.0, "step": 247800 }, { "epoch": 24.79, "grad_norm": 2.290886186528951e-06, "learning_rate": 8.7123745819398e-07, "loss": 0.0, "step": 247900 }, { "epoch": 24.8, "grad_norm": 1.563448222441366e-06, "learning_rate": 8.695652173913044e-07, "loss": 0.0, "step": 248000 }, { "epoch": 24.81, "grad_norm": 1.2164559848315548e-06, "learning_rate": 8.678929765886289e-07, "loss": 0.0028, "step": 248100 }, { "epoch": 24.82, "grad_norm": 1.7395184386259643e-06, "learning_rate": 8.662207357859533e-07, "loss": 0.0001, "step": 248200 }, { "epoch": 24.83, "grad_norm": 1.5021948911453364e-06, "learning_rate": 8.645484949832777e-07, "loss": 0.0, "step": 248300 }, { "epoch": 24.84, "grad_norm": 1.945095027622301e-05, "learning_rate": 8.628762541806021e-07, "loss": 0.0, "step": 248400 }, { "epoch": 24.85, "grad_norm": 1.636931528992136e-06, "learning_rate": 8.612040133779265e-07, "loss": 0.0, "step": 248500 }, { "epoch": 24.86, "grad_norm": 2.2785177407058654e-06, "learning_rate": 8.595317725752509e-07, "loss": 0.0, "step": 248600 }, { "epoch": 24.87, "grad_norm": 1.2720621498374385e-06, "learning_rate": 8.578595317725753e-07, "loss": 0.0, "step": 248700 }, { "epoch": 24.88, "grad_norm": 0.0009492366807535291, "learning_rate": 8.561872909698997e-07, "loss": 0.0, "step": 248800 }, { "epoch": 24.89, "grad_norm": 1.7387096704624128e-06, "learning_rate": 8.545150501672242e-07, "loss": 0.0, "step": 248900 }, { "epoch": 24.9, "grad_norm": 3.0261933261499507e-06, "learning_rate": 8.528428093645486e-07, "loss": 0.0, "step": 249000 }, { "epoch": 24.91, "grad_norm": 1.878150214906782e-06, "learning_rate": 8.511705685618729e-07, "loss": 0.0, "step": 249100 }, { "epoch": 24.92, "grad_norm": 4.968315806763712e-06, "learning_rate": 8.494983277591974e-07, "loss": 0.0, "step": 249200 }, { "epoch": 24.93, "grad_norm": 1.8417338196741184e-06, "learning_rate": 8.478260869565217e-07, "loss": 0.0, "step": 249300 }, { "epoch": 24.94, "grad_norm": 1.265710011466581e-06, "learning_rate": 8.461538461538463e-07, "loss": 0.0, "step": 249400 }, { "epoch": 24.95, "grad_norm": 1.504531383034191e-06, "learning_rate": 8.444816053511706e-07, "loss": 0.0, "step": 249500 }, { "epoch": 24.96, "grad_norm": 8.036552753765136e-05, "learning_rate": 8.42809364548495e-07, "loss": 0.0, "step": 249600 }, { "epoch": 24.97, "grad_norm": 1.7299427099715103e-06, "learning_rate": 8.411371237458195e-07, "loss": 0.0, "step": 249700 }, { "epoch": 24.98, "grad_norm": 9.004021194414236e-06, "learning_rate": 8.394648829431438e-07, "loss": 0.0, "step": 249800 }, { "epoch": 24.99, "grad_norm": 1.3094919495415525e-06, "learning_rate": 8.377926421404684e-07, "loss": 0.0009, "step": 249900 }, { "epoch": 25.0, "grad_norm": 3.1047757147462107e-06, "learning_rate": 8.361204013377927e-07, "loss": 0.0, "step": 250000 }, { "epoch": 25.0, "eval_accuracy": 0.9874, "eval_f1": 0.9874, "eval_loss": 0.15316404402256012, "eval_runtime": 134.4883, "eval_samples_per_second": 297.424, "eval_steps_per_second": 297.424, "step": 250000 }, { "epoch": 25.01, "grad_norm": 1.1634468819465837e-06, "learning_rate": 8.344481605351171e-07, "loss": 0.0, "step": 250100 }, { "epoch": 25.02, "grad_norm": 2.2065494249545736e-06, "learning_rate": 8.327759197324416e-07, "loss": 0.0, "step": 250200 }, { "epoch": 25.03, "grad_norm": 5.784725544799585e-06, "learning_rate": 8.311036789297659e-07, "loss": 0.0, "step": 250300 }, { "epoch": 25.04, "grad_norm": 0.0002647130750119686, "learning_rate": 8.294314381270905e-07, "loss": 0.0002, "step": 250400 }, { "epoch": 25.05, "grad_norm": 6.513052994705504e-07, "learning_rate": 8.277591973244148e-07, "loss": 0.0, "step": 250500 }, { "epoch": 25.06, "grad_norm": 3.345231334606069e-06, "learning_rate": 8.260869565217392e-07, "loss": 0.0, "step": 250600 }, { "epoch": 25.07, "grad_norm": 7.839264981157612e-06, "learning_rate": 8.244147157190636e-07, "loss": 0.0, "step": 250700 }, { "epoch": 25.08, "grad_norm": 5.135179435455939e-06, "learning_rate": 8.22742474916388e-07, "loss": 0.0, "step": 250800 }, { "epoch": 25.09, "grad_norm": 1.4390598153113388e-05, "learning_rate": 8.210702341137125e-07, "loss": 0.0, "step": 250900 }, { "epoch": 25.1, "grad_norm": 8.922448273551709e-07, "learning_rate": 8.193979933110368e-07, "loss": 0.0, "step": 251000 }, { "epoch": 25.11, "grad_norm": 2.3406830678140977e-06, "learning_rate": 8.177257525083613e-07, "loss": 0.0052, "step": 251100 }, { "epoch": 25.12, "grad_norm": 6.374452254931384e-07, "learning_rate": 8.160535117056857e-07, "loss": 0.0, "step": 251200 }, { "epoch": 25.13, "grad_norm": 3.375598680577241e-06, "learning_rate": 8.143812709030101e-07, "loss": 0.0, "step": 251300 }, { "epoch": 25.14, "grad_norm": 1.8560887156127137e-06, "learning_rate": 8.127090301003346e-07, "loss": 0.0, "step": 251400 }, { "epoch": 25.15, "grad_norm": 1.5140713003347628e-06, "learning_rate": 8.110367892976589e-07, "loss": 0.0, "step": 251500 }, { "epoch": 25.16, "grad_norm": 3.200523497071117e-05, "learning_rate": 8.093645484949833e-07, "loss": 0.0094, "step": 251600 }, { "epoch": 25.17, "grad_norm": 1.8809379980666563e-05, "learning_rate": 8.076923076923078e-07, "loss": 0.0, "step": 251700 }, { "epoch": 25.18, "grad_norm": 2.0571696950355545e-06, "learning_rate": 8.060200668896322e-07, "loss": 0.0, "step": 251800 }, { "epoch": 25.19, "grad_norm": 1.0208295861957595e-06, "learning_rate": 8.043478260869565e-07, "loss": 0.0, "step": 251900 }, { "epoch": 25.2, "grad_norm": 2.6802188131114235e-06, "learning_rate": 8.02675585284281e-07, "loss": 0.0024, "step": 252000 }, { "epoch": 25.21, "grad_norm": 2.5915858259395463e-06, "learning_rate": 8.010033444816054e-07, "loss": 0.0065, "step": 252100 }, { "epoch": 25.22, "grad_norm": 1.4809822914685355e-06, "learning_rate": 7.993311036789299e-07, "loss": 0.0, "step": 252200 }, { "epoch": 25.23, "grad_norm": 1.2989927427042858e-06, "learning_rate": 7.976588628762543e-07, "loss": 0.0, "step": 252300 }, { "epoch": 25.24, "grad_norm": 1.4638616221418488e-06, "learning_rate": 7.959866220735786e-07, "loss": 0.0, "step": 252400 }, { "epoch": 25.25, "grad_norm": 3.649939117167378e-06, "learning_rate": 7.943143812709031e-07, "loss": 0.0023, "step": 252500 }, { "epoch": 25.26, "grad_norm": 1.3916521766077494e-06, "learning_rate": 7.926421404682274e-07, "loss": 0.0, "step": 252600 }, { "epoch": 25.27, "grad_norm": 1.5946108078424004e-06, "learning_rate": 7.90969899665552e-07, "loss": 0.0, "step": 252700 }, { "epoch": 25.28, "grad_norm": 1.121036348195048e-05, "learning_rate": 7.892976588628764e-07, "loss": 0.0, "step": 252800 }, { "epoch": 25.29, "grad_norm": 1.0152185723200091e-06, "learning_rate": 7.876254180602007e-07, "loss": 0.0, "step": 252900 }, { "epoch": 25.3, "grad_norm": 3.492725227260962e-05, "learning_rate": 7.859531772575252e-07, "loss": 0.0, "step": 253000 }, { "epoch": 25.31, "grad_norm": 2.6968546080752276e-06, "learning_rate": 7.842809364548495e-07, "loss": 0.0, "step": 253100 }, { "epoch": 25.32, "grad_norm": 1.7197486386066885e-06, "learning_rate": 7.82608695652174e-07, "loss": 0.0, "step": 253200 }, { "epoch": 25.33, "grad_norm": 1.6494540204803343e-06, "learning_rate": 7.809364548494983e-07, "loss": 0.0, "step": 253300 }, { "epoch": 25.34, "grad_norm": 9.803585498957545e-07, "learning_rate": 7.792642140468228e-07, "loss": 0.0, "step": 253400 }, { "epoch": 25.35, "grad_norm": 8.82857307260565e-07, "learning_rate": 7.775919732441473e-07, "loss": 0.0, "step": 253500 }, { "epoch": 25.36, "grad_norm": 2.8013680548610864e-06, "learning_rate": 7.759197324414716e-07, "loss": 0.0061, "step": 253600 }, { "epoch": 25.37, "grad_norm": 2.4795892841211753e-06, "learning_rate": 7.742474916387961e-07, "loss": 0.0, "step": 253700 }, { "epoch": 25.38, "grad_norm": 3.668871158879483e-06, "learning_rate": 7.725752508361204e-07, "loss": 0.0, "step": 253800 }, { "epoch": 25.39, "grad_norm": 4.860013632423943e-06, "learning_rate": 7.709030100334448e-07, "loss": 0.0, "step": 253900 }, { "epoch": 25.4, "grad_norm": 1.904245323203213e-06, "learning_rate": 7.692307692307694e-07, "loss": 0.0, "step": 254000 }, { "epoch": 25.41, "grad_norm": 2.4638909508212237e-06, "learning_rate": 7.675585284280937e-07, "loss": 0.0, "step": 254100 }, { "epoch": 25.42, "grad_norm": 2.0516115455393447e-06, "learning_rate": 7.658862876254181e-07, "loss": 0.0, "step": 254200 }, { "epoch": 25.43, "grad_norm": 1.3270642966745072e-06, "learning_rate": 7.642140468227425e-07, "loss": 0.0, "step": 254300 }, { "epoch": 25.44, "grad_norm": 1.8419576690575923e-06, "learning_rate": 7.625418060200669e-07, "loss": 0.0, "step": 254400 }, { "epoch": 25.45, "grad_norm": 1.4690833268105052e-05, "learning_rate": 7.608695652173914e-07, "loss": 0.0, "step": 254500 }, { "epoch": 25.46, "grad_norm": 0.00014639816072303802, "learning_rate": 7.591973244147158e-07, "loss": 0.0, "step": 254600 }, { "epoch": 25.47, "grad_norm": 3.1858828606345924e-06, "learning_rate": 7.575250836120402e-07, "loss": 0.0038, "step": 254700 }, { "epoch": 25.48, "grad_norm": 3.5060836580669275e-06, "learning_rate": 7.558528428093646e-07, "loss": 0.0, "step": 254800 }, { "epoch": 25.49, "grad_norm": 4.6844779717503116e-06, "learning_rate": 7.54180602006689e-07, "loss": 0.005, "step": 254900 }, { "epoch": 25.5, "grad_norm": 6.809616479586111e-07, "learning_rate": 7.525083612040135e-07, "loss": 0.0, "step": 255000 }, { "epoch": 25.5, "eval_accuracy": 0.987225, "eval_f1": 0.987225, "eval_loss": 0.15019439160823822, "eval_runtime": 133.4308, "eval_samples_per_second": 299.781, "eval_steps_per_second": 299.781, "step": 255000 }, { "epoch": 25.51, "grad_norm": 3.1307606604968896e-06, "learning_rate": 7.508361204013379e-07, "loss": 0.0, "step": 255100 }, { "epoch": 25.52, "grad_norm": 1.4498136806651019e-06, "learning_rate": 7.491638795986622e-07, "loss": 0.003, "step": 255200 }, { "epoch": 25.53, "grad_norm": 6.193661101860926e-05, "learning_rate": 7.474916387959867e-07, "loss": 0.0, "step": 255300 }, { "epoch": 25.54, "grad_norm": 1.8212824670627015e-06, "learning_rate": 7.458193979933111e-07, "loss": 0.0, "step": 255400 }, { "epoch": 25.55, "grad_norm": 2.457471509842435e-06, "learning_rate": 7.441471571906355e-07, "loss": 0.0, "step": 255500 }, { "epoch": 25.56, "grad_norm": 0.00020605024474207312, "learning_rate": 7.424749163879599e-07, "loss": 0.0, "step": 255600 }, { "epoch": 25.57, "grad_norm": 7.919943527667783e-06, "learning_rate": 7.408026755852843e-07, "loss": 0.0, "step": 255700 }, { "epoch": 25.58, "grad_norm": 9.080182508114376e-07, "learning_rate": 7.391304347826088e-07, "loss": 0.0, "step": 255800 }, { "epoch": 25.59, "grad_norm": 2.7886048883374315e-06, "learning_rate": 7.374581939799332e-07, "loss": 0.0, "step": 255900 }, { "epoch": 25.6, "grad_norm": 1.3184147746869712e-06, "learning_rate": 7.357859531772576e-07, "loss": 0.0, "step": 256000 }, { "epoch": 25.61, "grad_norm": 1.5250210481099202e-06, "learning_rate": 7.34113712374582e-07, "loss": 0.0, "step": 256100 }, { "epoch": 25.62, "grad_norm": 2.6570999125397066e-06, "learning_rate": 7.324414715719063e-07, "loss": 0.0, "step": 256200 }, { "epoch": 25.63, "grad_norm": 1.266855520043464e-06, "learning_rate": 7.307692307692309e-07, "loss": 0.0, "step": 256300 }, { "epoch": 25.64, "grad_norm": 1.032599357131403e-06, "learning_rate": 7.290969899665552e-07, "loss": 0.0, "step": 256400 }, { "epoch": 25.65, "grad_norm": 1.1953579814871773e-06, "learning_rate": 7.274247491638796e-07, "loss": 0.0, "step": 256500 }, { "epoch": 25.66, "grad_norm": 0.000963278638664633, "learning_rate": 7.257525083612041e-07, "loss": 0.0032, "step": 256600 }, { "epoch": 25.67, "grad_norm": 3.835300958598964e-05, "learning_rate": 7.240802675585284e-07, "loss": 0.0, "step": 256700 }, { "epoch": 25.68, "grad_norm": 1.6685755781509215e-06, "learning_rate": 7.22408026755853e-07, "loss": 0.0, "step": 256800 }, { "epoch": 25.69, "grad_norm": 2.4940084131230833e-06, "learning_rate": 7.207357859531773e-07, "loss": 0.0, "step": 256900 }, { "epoch": 25.7, "grad_norm": 1.5732797464806936e-06, "learning_rate": 7.190635451505017e-07, "loss": 0.0, "step": 257000 }, { "epoch": 25.71, "grad_norm": 9.188497642753646e-07, "learning_rate": 7.173913043478262e-07, "loss": 0.0, "step": 257100 }, { "epoch": 25.72, "grad_norm": 0.00019525196694303304, "learning_rate": 7.157190635451505e-07, "loss": 0.0, "step": 257200 }, { "epoch": 25.73, "grad_norm": 9.72332486526284e-07, "learning_rate": 7.140468227424751e-07, "loss": 0.0, "step": 257300 }, { "epoch": 25.74, "grad_norm": 2.251427076771506e-06, "learning_rate": 7.123745819397994e-07, "loss": 0.0, "step": 257400 }, { "epoch": 25.75, "grad_norm": 4.722788162325742e-06, "learning_rate": 7.107023411371238e-07, "loss": 0.0, "step": 257500 }, { "epoch": 25.76, "grad_norm": 4.204017841402674e-06, "learning_rate": 7.090301003344482e-07, "loss": 0.0, "step": 257600 }, { "epoch": 25.77, "grad_norm": 1.0709164826039341e-06, "learning_rate": 7.073578595317726e-07, "loss": 0.0, "step": 257700 }, { "epoch": 25.78, "grad_norm": 1.1540734021764365e-06, "learning_rate": 7.056856187290971e-07, "loss": 0.0, "step": 257800 }, { "epoch": 25.79, "grad_norm": 9.432333172298968e-07, "learning_rate": 7.040133779264215e-07, "loss": 0.0, "step": 257900 }, { "epoch": 25.8, "grad_norm": 8.946111051955086e-07, "learning_rate": 7.023411371237459e-07, "loss": 0.0, "step": 258000 }, { "epoch": 25.81, "grad_norm": 2.140501010217122e-06, "learning_rate": 7.006688963210703e-07, "loss": 0.0, "step": 258100 }, { "epoch": 25.82, "grad_norm": 9.419418915967981e-07, "learning_rate": 6.989966555183947e-07, "loss": 0.0, "step": 258200 }, { "epoch": 25.83, "grad_norm": 2.9447837732732296e-05, "learning_rate": 6.973244147157191e-07, "loss": 0.0, "step": 258300 }, { "epoch": 25.84, "grad_norm": 2.420624014121131e-06, "learning_rate": 6.956521739130435e-07, "loss": 0.0, "step": 258400 }, { "epoch": 25.85, "grad_norm": 0.011331389658153057, "learning_rate": 6.939799331103679e-07, "loss": 0.0026, "step": 258500 }, { "epoch": 25.86, "grad_norm": 5.007507934351452e-05, "learning_rate": 6.923076923076924e-07, "loss": 0.0, "step": 258600 }, { "epoch": 25.87, "grad_norm": 5.7870420278050005e-06, "learning_rate": 6.906354515050168e-07, "loss": 0.0042, "step": 258700 }, { "epoch": 25.88, "grad_norm": 0.0006274498300626874, "learning_rate": 6.889632107023412e-07, "loss": 0.0, "step": 258800 }, { "epoch": 25.89, "grad_norm": 1.82863800546329e-06, "learning_rate": 6.872909698996656e-07, "loss": 0.0, "step": 258900 }, { "epoch": 25.9, "grad_norm": 1.6336121007043403e-06, "learning_rate": 6.8561872909699e-07, "loss": 0.0076, "step": 259000 }, { "epoch": 25.91, "grad_norm": 4.9206792027689517e-05, "learning_rate": 6.839464882943145e-07, "loss": 0.0, "step": 259100 }, { "epoch": 25.92, "grad_norm": 1.827681217037025e-06, "learning_rate": 6.822742474916389e-07, "loss": 0.0, "step": 259200 }, { "epoch": 25.93, "grad_norm": 1.8825265897248755e-06, "learning_rate": 6.806020066889632e-07, "loss": 0.0, "step": 259300 }, { "epoch": 25.94, "grad_norm": 1.943802317327936e-06, "learning_rate": 6.789297658862877e-07, "loss": 0.0, "step": 259400 }, { "epoch": 25.95, "grad_norm": 2.9297807486727834e-05, "learning_rate": 6.77257525083612e-07, "loss": 0.0, "step": 259500 }, { "epoch": 25.96, "grad_norm": 6.956246943445876e-05, "learning_rate": 6.755852842809366e-07, "loss": 0.0, "step": 259600 }, { "epoch": 25.97, "grad_norm": 5.574890565185342e-06, "learning_rate": 6.73913043478261e-07, "loss": 0.0, "step": 259700 }, { "epoch": 25.98, "grad_norm": 1.5360980114564882e-06, "learning_rate": 6.722408026755853e-07, "loss": 0.0, "step": 259800 }, { "epoch": 25.99, "grad_norm": 0.00014320595073513687, "learning_rate": 6.705685618729098e-07, "loss": 0.0, "step": 259900 }, { "epoch": 26.0, "grad_norm": 2.329884182472597e-06, "learning_rate": 6.688963210702341e-07, "loss": 0.0, "step": 260000 }, { "epoch": 26.0, "eval_accuracy": 0.987775, "eval_f1": 0.987775, "eval_loss": 0.14540669322013855, "eval_runtime": 128.3816, "eval_samples_per_second": 311.571, "eval_steps_per_second": 311.571, "step": 260000 }, { "epoch": 26.01, "grad_norm": 3.0397443424590165e-06, "learning_rate": 6.672240802675586e-07, "loss": 0.0, "step": 260100 }, { "epoch": 26.02, "grad_norm": 2.670638423296623e-06, "learning_rate": 6.65551839464883e-07, "loss": 0.0, "step": 260200 }, { "epoch": 26.03, "grad_norm": 1.2526317050287616e-06, "learning_rate": 6.638795986622074e-07, "loss": 0.0, "step": 260300 }, { "epoch": 26.04, "grad_norm": 1.997796061914414e-06, "learning_rate": 6.622073578595319e-07, "loss": 0.0, "step": 260400 }, { "epoch": 26.05, "grad_norm": 6.544435677824367e-07, "learning_rate": 6.605351170568562e-07, "loss": 0.0043, "step": 260500 }, { "epoch": 26.06, "grad_norm": 1.0374384373790235e-06, "learning_rate": 6.588628762541807e-07, "loss": 0.0, "step": 260600 }, { "epoch": 26.07, "grad_norm": 1.676651550042152e-06, "learning_rate": 6.57190635451505e-07, "loss": 0.0, "step": 260700 }, { "epoch": 26.08, "grad_norm": 6.39155450699036e-06, "learning_rate": 6.555183946488294e-07, "loss": 0.0, "step": 260800 }, { "epoch": 26.09, "grad_norm": 2.87171360469074e-06, "learning_rate": 6.53846153846154e-07, "loss": 0.0, "step": 260900 }, { "epoch": 26.1, "grad_norm": 1.6348642475350061e-06, "learning_rate": 6.521739130434783e-07, "loss": 0.0, "step": 261000 }, { "epoch": 26.11, "grad_norm": 4.175280992058106e-06, "learning_rate": 6.505016722408028e-07, "loss": 0.0, "step": 261100 }, { "epoch": 26.12, "grad_norm": 0.014059566892683506, "learning_rate": 6.488294314381271e-07, "loss": 0.0, "step": 261200 }, { "epoch": 26.13, "grad_norm": 2.457397158650565e-06, "learning_rate": 6.471571906354515e-07, "loss": 0.0, "step": 261300 }, { "epoch": 26.14, "grad_norm": 2.0873145331279375e-05, "learning_rate": 6.45484949832776e-07, "loss": 0.0, "step": 261400 }, { "epoch": 26.15, "grad_norm": 2.946242375401198e-06, "learning_rate": 6.438127090301004e-07, "loss": 0.0, "step": 261500 }, { "epoch": 26.16, "grad_norm": 5.695550953532802e-06, "learning_rate": 6.421404682274248e-07, "loss": 0.0, "step": 261600 }, { "epoch": 26.17, "grad_norm": 1.6086561345218797e-06, "learning_rate": 6.404682274247492e-07, "loss": 0.0, "step": 261700 }, { "epoch": 26.18, "grad_norm": 3.3722619718901115e-06, "learning_rate": 6.387959866220736e-07, "loss": 0.0, "step": 261800 }, { "epoch": 26.19, "grad_norm": 1.4849182434772956e-06, "learning_rate": 6.371237458193981e-07, "loss": 0.0, "step": 261900 }, { "epoch": 26.2, "grad_norm": 3.5905916320189135e-06, "learning_rate": 6.354515050167225e-07, "loss": 0.0, "step": 262000 }, { "epoch": 26.21, "grad_norm": 1.8669687733563478e-06, "learning_rate": 6.337792642140468e-07, "loss": 0.0, "step": 262100 }, { "epoch": 26.22, "grad_norm": 9.240634426532779e-07, "learning_rate": 6.321070234113713e-07, "loss": 0.0, "step": 262200 }, { "epoch": 26.23, "grad_norm": 8.699660156707978e-07, "learning_rate": 6.304347826086957e-07, "loss": 0.0, "step": 262300 }, { "epoch": 26.24, "grad_norm": 2.213256948380149e-06, "learning_rate": 6.287625418060201e-07, "loss": 0.0, "step": 262400 }, { "epoch": 26.25, "grad_norm": 1.3434569154924247e-06, "learning_rate": 6.270903010033446e-07, "loss": 0.0, "step": 262500 }, { "epoch": 26.26, "grad_norm": 8.861171068019758e-07, "learning_rate": 6.254180602006689e-07, "loss": 0.0, "step": 262600 }, { "epoch": 26.27, "grad_norm": 1.178672050627938e-06, "learning_rate": 6.237458193979934e-07, "loss": 0.0, "step": 262700 }, { "epoch": 26.28, "grad_norm": 1.0737633147073211e-06, "learning_rate": 6.220735785953178e-07, "loss": 0.0, "step": 262800 }, { "epoch": 26.29, "grad_norm": 1.4926870335330022e-06, "learning_rate": 6.204013377926422e-07, "loss": 0.0, "step": 262900 }, { "epoch": 26.3, "grad_norm": 1.8841998326024623e-06, "learning_rate": 6.187290969899666e-07, "loss": 0.0, "step": 263000 }, { "epoch": 26.31, "grad_norm": 2.844284608727321e-06, "learning_rate": 6.17056856187291e-07, "loss": 0.0, "step": 263100 }, { "epoch": 26.32, "grad_norm": 4.620580511982553e-06, "learning_rate": 6.153846153846155e-07, "loss": 0.0, "step": 263200 }, { "epoch": 26.33, "grad_norm": 2.383153059781762e-06, "learning_rate": 6.137123745819398e-07, "loss": 0.0, "step": 263300 }, { "epoch": 26.34, "grad_norm": 1.2054698572683265e-06, "learning_rate": 6.120401337792643e-07, "loss": 0.0, "step": 263400 }, { "epoch": 26.35, "grad_norm": 1.1026223774024402e-06, "learning_rate": 6.103678929765887e-07, "loss": 0.0, "step": 263500 }, { "epoch": 26.36, "grad_norm": 2.2108922621555394e-06, "learning_rate": 6.086956521739131e-07, "loss": 0.0, "step": 263600 }, { "epoch": 26.37, "grad_norm": 9.278629136133532e-07, "learning_rate": 6.070234113712375e-07, "loss": 0.0, "step": 263700 }, { "epoch": 26.38, "grad_norm": 1.3845400417267228e-06, "learning_rate": 6.053511705685619e-07, "loss": 0.0, "step": 263800 }, { "epoch": 26.39, "grad_norm": 6.267321623454336e-06, "learning_rate": 6.036789297658863e-07, "loss": 0.0, "step": 263900 }, { "epoch": 26.4, "grad_norm": 1.303371732319647e-06, "learning_rate": 6.020066889632107e-07, "loss": 0.0, "step": 264000 }, { "epoch": 26.41, "grad_norm": 1.8336708080823882e-06, "learning_rate": 6.003344481605352e-07, "loss": 0.0, "step": 264100 }, { "epoch": 26.42, "grad_norm": 6.330028554657474e-05, "learning_rate": 5.986622073578596e-07, "loss": 0.0, "step": 264200 }, { "epoch": 26.43, "grad_norm": 1.0136543551197974e-06, "learning_rate": 5.96989966555184e-07, "loss": 0.0, "step": 264300 }, { "epoch": 26.44, "grad_norm": 3.786742354350281e-06, "learning_rate": 5.953177257525084e-07, "loss": 0.0045, "step": 264400 }, { "epoch": 26.45, "grad_norm": 1.3409010080067674e-06, "learning_rate": 5.936454849498328e-07, "loss": 0.0, "step": 264500 }, { "epoch": 26.46, "grad_norm": 1.185390374303097e-06, "learning_rate": 5.919732441471572e-07, "loss": 0.0, "step": 264600 }, { "epoch": 26.47, "grad_norm": 1.233201146533247e-05, "learning_rate": 5.903010033444817e-07, "loss": 0.0036, "step": 264700 }, { "epoch": 26.48, "grad_norm": 1.4137769994704286e-06, "learning_rate": 5.886287625418061e-07, "loss": 0.0, "step": 264800 }, { "epoch": 26.49, "grad_norm": 7.992609880602686e-07, "learning_rate": 5.869565217391305e-07, "loss": 0.0, "step": 264900 }, { "epoch": 26.5, "grad_norm": 1.5214644690786372e-06, "learning_rate": 5.852842809364549e-07, "loss": 0.0, "step": 265000 }, { "epoch": 26.5, "eval_accuracy": 0.987925, "eval_f1": 0.987925, "eval_loss": 0.14723196625709534, "eval_runtime": 127.175, "eval_samples_per_second": 314.527, "eval_steps_per_second": 314.527, "step": 265000 }, { "epoch": 26.51, "grad_norm": 8.622855602880009e-06, "learning_rate": 5.836120401337793e-07, "loss": 0.0, "step": 265100 }, { "epoch": 26.52, "grad_norm": 3.604415906011127e-05, "learning_rate": 5.819397993311037e-07, "loss": 0.0, "step": 265200 }, { "epoch": 26.53, "grad_norm": 1.473955080655287e-06, "learning_rate": 5.802675585284281e-07, "loss": 0.0, "step": 265300 }, { "epoch": 26.54, "grad_norm": 2.554420461819973e-06, "learning_rate": 5.785953177257526e-07, "loss": 0.0, "step": 265400 }, { "epoch": 26.55, "grad_norm": 2.2791825813328614e-06, "learning_rate": 5.76923076923077e-07, "loss": 0.0, "step": 265500 }, { "epoch": 26.56, "grad_norm": 1.3585033684648806e-06, "learning_rate": 5.752508361204014e-07, "loss": 0.0, "step": 265600 }, { "epoch": 26.57, "grad_norm": 1.1605886811594246e-06, "learning_rate": 5.735785953177258e-07, "loss": 0.0, "step": 265700 }, { "epoch": 26.58, "grad_norm": 1.446388523618225e-06, "learning_rate": 5.719063545150502e-07, "loss": 0.0, "step": 265800 }, { "epoch": 26.59, "grad_norm": 1.4642024552813382e-06, "learning_rate": 5.702341137123746e-07, "loss": 0.0, "step": 265900 }, { "epoch": 26.6, "grad_norm": 1.8093830931320554e-06, "learning_rate": 5.68561872909699e-07, "loss": 0.0, "step": 266000 }, { "epoch": 26.61, "grad_norm": 2.076187001875951e-06, "learning_rate": 5.668896321070235e-07, "loss": 0.0, "step": 266100 }, { "epoch": 26.62, "grad_norm": 5.8339724091638345e-06, "learning_rate": 5.652173913043478e-07, "loss": 0.0, "step": 266200 }, { "epoch": 26.63, "grad_norm": 1.15121656563133e-06, "learning_rate": 5.635451505016723e-07, "loss": 0.0, "step": 266300 }, { "epoch": 26.64, "grad_norm": 9.865797210295568e-07, "learning_rate": 5.618729096989967e-07, "loss": 0.0, "step": 266400 }, { "epoch": 26.65, "grad_norm": 3.917589765478624e-06, "learning_rate": 5.602006688963211e-07, "loss": 0.0, "step": 266500 }, { "epoch": 26.66, "grad_norm": 5.671467988577206e-07, "learning_rate": 5.585284280936456e-07, "loss": 0.0, "step": 266600 }, { "epoch": 26.67, "grad_norm": 3.5159577237209305e-05, "learning_rate": 5.568561872909699e-07, "loss": 0.0, "step": 266700 }, { "epoch": 26.68, "grad_norm": 9.57158249548229e-07, "learning_rate": 5.551839464882944e-07, "loss": 0.0054, "step": 266800 }, { "epoch": 26.69, "grad_norm": 9.69288748819963e-07, "learning_rate": 5.535117056856188e-07, "loss": 0.0, "step": 266900 }, { "epoch": 26.7, "grad_norm": 6.826439403084805e-06, "learning_rate": 5.518394648829432e-07, "loss": 0.0, "step": 267000 }, { "epoch": 26.71, "grad_norm": 1.5509156128246104e-06, "learning_rate": 5.501672240802676e-07, "loss": 0.0, "step": 267100 }, { "epoch": 26.72, "grad_norm": 1.6871094885573257e-06, "learning_rate": 5.48494983277592e-07, "loss": 0.0, "step": 267200 }, { "epoch": 26.73, "grad_norm": 7.855343255869229e-07, "learning_rate": 5.468227424749165e-07, "loss": 0.0, "step": 267300 }, { "epoch": 26.74, "grad_norm": 7.116402116480458e-07, "learning_rate": 5.451505016722408e-07, "loss": 0.0047, "step": 267400 }, { "epoch": 26.75, "grad_norm": 1.349264834971109e-06, "learning_rate": 5.434782608695653e-07, "loss": 0.0, "step": 267500 }, { "epoch": 26.76, "grad_norm": 1.5048664181449567e-06, "learning_rate": 5.418060200668896e-07, "loss": 0.0, "step": 267600 }, { "epoch": 26.77, "grad_norm": 1.0677230420697015e-05, "learning_rate": 5.401337792642141e-07, "loss": 0.0, "step": 267700 }, { "epoch": 26.78, "grad_norm": 1.0198332347499672e-06, "learning_rate": 5.384615384615386e-07, "loss": 0.0, "step": 267800 }, { "epoch": 26.79, "grad_norm": 1.7846756463768543e-06, "learning_rate": 5.367892976588629e-07, "loss": 0.0, "step": 267900 }, { "epoch": 26.8, "grad_norm": 9.937119784808601e-07, "learning_rate": 5.351170568561874e-07, "loss": 0.0, "step": 268000 }, { "epoch": 26.81, "grad_norm": 1.4857420183034264e-06, "learning_rate": 5.334448160535117e-07, "loss": 0.0, "step": 268100 }, { "epoch": 26.82, "grad_norm": 1.405791294928349e-06, "learning_rate": 5.317725752508362e-07, "loss": 0.0, "step": 268200 }, { "epoch": 26.83, "grad_norm": 1.7296881651418516e-06, "learning_rate": 5.301003344481605e-07, "loss": 0.0, "step": 268300 }, { "epoch": 26.84, "grad_norm": 1.4348109289130662e-05, "learning_rate": 5.28428093645485e-07, "loss": 0.0, "step": 268400 }, { "epoch": 26.85, "grad_norm": 1.8275501361131319e-06, "learning_rate": 5.267558528428094e-07, "loss": 0.0, "step": 268500 }, { "epoch": 26.86, "grad_norm": 1.2868830481238547e-06, "learning_rate": 5.250836120401338e-07, "loss": 0.0, "step": 268600 }, { "epoch": 26.87, "grad_norm": 1.6959703543761862e-06, "learning_rate": 5.234113712374583e-07, "loss": 0.0, "step": 268700 }, { "epoch": 26.88, "grad_norm": 1.57480280904565e-06, "learning_rate": 5.217391304347826e-07, "loss": 0.0, "step": 268800 }, { "epoch": 26.89, "grad_norm": 1.4843112694507e-06, "learning_rate": 5.200668896321071e-07, "loss": 0.0015, "step": 268900 }, { "epoch": 26.9, "grad_norm": 1.2535028872662224e-06, "learning_rate": 5.183946488294314e-07, "loss": 0.0, "step": 269000 }, { "epoch": 26.91, "grad_norm": 0.00023764085199218243, "learning_rate": 5.167224080267559e-07, "loss": 0.0, "step": 269100 }, { "epoch": 26.92, "grad_norm": 0.00029709271620959044, "learning_rate": 5.150501672240804e-07, "loss": 0.0, "step": 269200 }, { "epoch": 26.93, "grad_norm": 7.498289846807893e-07, "learning_rate": 5.133779264214047e-07, "loss": 0.0, "step": 269300 }, { "epoch": 26.94, "grad_norm": 1.443891278540832e-06, "learning_rate": 5.117056856187292e-07, "loss": 0.0, "step": 269400 }, { "epoch": 26.95, "grad_norm": 9.474859689362347e-05, "learning_rate": 5.100334448160535e-07, "loss": 0.0, "step": 269500 }, { "epoch": 26.96, "grad_norm": 1.0035858394985553e-05, "learning_rate": 5.08361204013378e-07, "loss": 0.0, "step": 269600 }, { "epoch": 26.97, "grad_norm": 2.1819903395225992e-06, "learning_rate": 5.066889632107023e-07, "loss": 0.0034, "step": 269700 }, { "epoch": 26.98, "grad_norm": 3.6087269108975306e-05, "learning_rate": 5.050167224080268e-07, "loss": 0.0, "step": 269800 }, { "epoch": 26.99, "grad_norm": 9.546632782075903e-07, "learning_rate": 5.033444816053512e-07, "loss": 0.0, "step": 269900 }, { "epoch": 27.0, "grad_norm": 3.2282516713166842e-06, "learning_rate": 5.016722408026756e-07, "loss": 0.0, "step": 270000 }, { "epoch": 27.0, "eval_accuracy": 0.988125, "eval_f1": 0.988125, "eval_loss": 0.1494733989238739, "eval_runtime": 128.4859, "eval_samples_per_second": 311.318, "eval_steps_per_second": 311.318, "step": 270000 }, { "epoch": 27.01, "grad_norm": 6.7214832597528584e-06, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "step": 270100 }, { "epoch": 27.02, "grad_norm": 1.6825376860651886e-06, "learning_rate": 4.983277591973244e-07, "loss": 0.0, "step": 270200 }, { "epoch": 27.03, "grad_norm": 3.110191073574242e-06, "learning_rate": 4.966555183946489e-07, "loss": 0.0, "step": 270300 }, { "epoch": 27.04, "grad_norm": 2.0460465748328716e-06, "learning_rate": 4.949832775919733e-07, "loss": 0.0, "step": 270400 }, { "epoch": 27.05, "grad_norm": 1.6207150110858493e-06, "learning_rate": 4.933110367892977e-07, "loss": 0.0, "step": 270500 }, { "epoch": 27.06, "grad_norm": 1.8051798633678118e-06, "learning_rate": 4.916387959866222e-07, "loss": 0.0, "step": 270600 }, { "epoch": 27.07, "grad_norm": 4.891130629403051e-06, "learning_rate": 4.899665551839465e-07, "loss": 0.0, "step": 270700 }, { "epoch": 27.08, "grad_norm": 1.253480604646029e-06, "learning_rate": 4.88294314381271e-07, "loss": 0.0, "step": 270800 }, { "epoch": 27.09, "grad_norm": 1.683352252257464e-06, "learning_rate": 4.866220735785953e-07, "loss": 0.0, "step": 270900 }, { "epoch": 27.1, "grad_norm": 5.776287434855476e-05, "learning_rate": 4.849498327759198e-07, "loss": 0.0, "step": 271000 }, { "epoch": 27.11, "grad_norm": 5.219275863055373e-06, "learning_rate": 4.832775919732442e-07, "loss": 0.0, "step": 271100 }, { "epoch": 27.12, "grad_norm": 1.6335097825503908e-05, "learning_rate": 4.816053511705686e-07, "loss": 0.0, "step": 271200 }, { "epoch": 27.13, "grad_norm": 1.0641881544870557e-06, "learning_rate": 4.79933110367893e-07, "loss": 0.0, "step": 271300 }, { "epoch": 27.14, "grad_norm": 9.760987040863256e-07, "learning_rate": 4.782608695652174e-07, "loss": 0.0, "step": 271400 }, { "epoch": 27.15, "grad_norm": 1.0345853524995618e-06, "learning_rate": 4.765886287625419e-07, "loss": 0.0, "step": 271500 }, { "epoch": 27.16, "grad_norm": 8.688991783856181e-07, "learning_rate": 4.7491638795986625e-07, "loss": 0.0, "step": 271600 }, { "epoch": 27.17, "grad_norm": 1.5890523172856774e-06, "learning_rate": 4.7324414715719066e-07, "loss": 0.0, "step": 271700 }, { "epoch": 27.18, "grad_norm": 0.00010488680709386244, "learning_rate": 4.715719063545151e-07, "loss": 0.0, "step": 271800 }, { "epoch": 27.19, "grad_norm": 2.86882800537569e-06, "learning_rate": 4.698996655518395e-07, "loss": 0.0, "step": 271900 }, { "epoch": 27.2, "grad_norm": 1.1794188139901962e-05, "learning_rate": 4.682274247491639e-07, "loss": 0.0, "step": 272000 }, { "epoch": 27.21, "grad_norm": 1.7052036582754226e-06, "learning_rate": 4.665551839464883e-07, "loss": 0.0, "step": 272100 }, { "epoch": 27.22, "grad_norm": 8.556891657462984e-07, "learning_rate": 4.6488294314381275e-07, "loss": 0.0, "step": 272200 }, { "epoch": 27.23, "grad_norm": 1.2632311836568988e-06, "learning_rate": 4.6321070234113715e-07, "loss": 0.0, "step": 272300 }, { "epoch": 27.24, "grad_norm": 1.4065376490179915e-06, "learning_rate": 4.615384615384616e-07, "loss": 0.0, "step": 272400 }, { "epoch": 27.25, "grad_norm": 3.707805080921389e-05, "learning_rate": 4.5986622073578597e-07, "loss": 0.0, "step": 272500 }, { "epoch": 27.26, "grad_norm": 6.321478394966107e-06, "learning_rate": 4.581939799331104e-07, "loss": 0.0, "step": 272600 }, { "epoch": 27.27, "grad_norm": 1.3622416190628428e-06, "learning_rate": 4.5652173913043484e-07, "loss": 0.0, "step": 272700 }, { "epoch": 27.28, "grad_norm": 1.52013853949029e-06, "learning_rate": 4.5484949832775925e-07, "loss": 0.0, "step": 272800 }, { "epoch": 27.29, "grad_norm": 6.5507720137247816e-06, "learning_rate": 4.531772575250837e-07, "loss": 0.0, "step": 272900 }, { "epoch": 27.3, "grad_norm": 9.356578630104195e-06, "learning_rate": 4.5150501672240806e-07, "loss": 0.0, "step": 273000 }, { "epoch": 27.31, "grad_norm": 1.1886962738572038e-06, "learning_rate": 4.4983277591973247e-07, "loss": 0.0, "step": 273100 }, { "epoch": 27.32, "grad_norm": 6.281496098381467e-06, "learning_rate": 4.481605351170569e-07, "loss": 0.0, "step": 273200 }, { "epoch": 27.33, "grad_norm": 9.167351890937425e-07, "learning_rate": 4.4648829431438134e-07, "loss": 0.0, "step": 273300 }, { "epoch": 27.34, "grad_norm": 6.569893571395369e-07, "learning_rate": 4.448160535117057e-07, "loss": 0.0, "step": 273400 }, { "epoch": 27.35, "grad_norm": 0.00026023961254395545, "learning_rate": 4.431438127090301e-07, "loss": 0.0, "step": 273500 }, { "epoch": 27.36, "grad_norm": 1.3729372767556924e-06, "learning_rate": 4.4147157190635456e-07, "loss": 0.0, "step": 273600 }, { "epoch": 27.37, "grad_norm": 2.290072643518215e-06, "learning_rate": 4.3979933110367897e-07, "loss": 0.0, "step": 273700 }, { "epoch": 27.38, "grad_norm": 1.0271631936120684e-06, "learning_rate": 4.3812709030100343e-07, "loss": 0.0, "step": 273800 }, { "epoch": 27.39, "grad_norm": 7.632092433595972e-07, "learning_rate": 4.364548494983278e-07, "loss": 0.0, "step": 273900 }, { "epoch": 27.4, "grad_norm": 2.6731247544375947e-06, "learning_rate": 4.347826086956522e-07, "loss": 0.0, "step": 274000 }, { "epoch": 27.41, "grad_norm": 1.3995519111631438e-05, "learning_rate": 4.3311036789297665e-07, "loss": 0.0, "step": 274100 }, { "epoch": 27.42, "grad_norm": 1.2089567462680861e-06, "learning_rate": 4.3143812709030106e-07, "loss": 0.0, "step": 274200 }, { "epoch": 27.43, "grad_norm": 1.994718104469939e-06, "learning_rate": 4.2976588628762546e-07, "loss": 0.0038, "step": 274300 }, { "epoch": 27.44, "grad_norm": 1.106251716009865e-06, "learning_rate": 4.2809364548494987e-07, "loss": 0.0, "step": 274400 }, { "epoch": 27.45, "grad_norm": 0.00026735287974588573, "learning_rate": 4.264214046822743e-07, "loss": 0.0, "step": 274500 }, { "epoch": 27.46, "grad_norm": 2.463541704855743e-06, "learning_rate": 4.247491638795987e-07, "loss": 0.0, "step": 274600 }, { "epoch": 27.47, "grad_norm": 4.47788579549524e-06, "learning_rate": 4.2307692307692315e-07, "loss": 0.0031, "step": 274700 }, { "epoch": 27.48, "grad_norm": 5.374798206503328e-07, "learning_rate": 4.214046822742475e-07, "loss": 0.0, "step": 274800 }, { "epoch": 27.49, "grad_norm": 1.548745672153018e-06, "learning_rate": 4.197324414715719e-07, "loss": 0.0, "step": 274900 }, { "epoch": 27.5, "grad_norm": 6.070744120734162e-07, "learning_rate": 4.1806020066889637e-07, "loss": 0.0004, "step": 275000 }, { "epoch": 27.5, "eval_accuracy": 0.98815, "eval_f1": 0.98815, "eval_loss": 0.15360614657402039, "eval_runtime": 127.9472, "eval_samples_per_second": 312.629, "eval_steps_per_second": 312.629, "step": 275000 } ], "logging_steps": 100, "max_steps": 300000, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.865769415769616e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }