{ "best_global_step": 25362, "best_metric": 0.7539058443703921, "best_model_checkpoint": "output/QA-Llama-3.1-4155/checkpoint-25362", "epoch": 3.0, "eval_steps": 500, "global_step": 25362, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011828720132481666, "grad_norm": 123.34188842773438, "learning_rate": 8.331343883367672e-08, "loss": 0.92, "step": 10 }, { "epoch": 0.0023657440264963333, "grad_norm": 106.9783935546875, "learning_rate": 1.0839328296452977e-07, "loss": 0.8052, "step": 20 }, { "epoch": 0.0035486160397444995, "grad_norm": 70.16786193847656, "learning_rate": 1.230640513050104e-07, "loss": 0.5851, "step": 30 }, { "epoch": 0.0047314880529926665, "grad_norm": 38.23607635498047, "learning_rate": 1.3347312709538282e-07, "loss": 0.3751, "step": 40 }, { "epoch": 0.005914360066240833, "grad_norm": 24.72532844543457, "learning_rate": 1.4154703353650035e-07, "loss": 0.2294, "step": 50 }, { "epoch": 0.007097232079488999, "grad_norm": 8.344695091247559, "learning_rate": 1.4814389543586343e-07, "loss": 0.212, "step": 60 }, { "epoch": 0.008280104092737165, "grad_norm": 6.363811016082764, "learning_rate": 1.5372146269886458e-07, "loss": 0.2126, "step": 70 }, { "epoch": 0.009462976105985333, "grad_norm": 11.662599563598633, "learning_rate": 1.5855297122623586e-07, "loss": 0.1907, "step": 80 }, { "epoch": 0.0106458481192335, "grad_norm": 10.897701263427734, "learning_rate": 1.6281466377634406e-07, "loss": 0.1804, "step": 90 }, { "epoch": 0.011828720132481665, "grad_norm": 7.011815071105957, "learning_rate": 1.6662687766735344e-07, "loss": 0.1785, "step": 100 }, { "epoch": 0.013011592145729832, "grad_norm": 6.473340034484863, "learning_rate": 1.7007544461044483e-07, "loss": 0.1682, "step": 110 }, { "epoch": 0.014194464158977998, "grad_norm": 10.153539657592773, "learning_rate": 1.732237395667165e-07, "loss": 0.1666, "step": 120 }, { "epoch": 0.015377336172226166, "grad_norm": 10.031224250793457, "learning_rate": 1.7611989018027314e-07, "loss": 0.163, "step": 130 }, { "epoch": 0.01656020818547433, "grad_norm": 5.905551910400391, "learning_rate": 1.7880130682971762e-07, "loss": 0.162, "step": 140 }, { "epoch": 0.017743080198722498, "grad_norm": 9.35179328918457, "learning_rate": 1.8129764600783402e-07, "loss": 0.1624, "step": 150 }, { "epoch": 0.018925952211970666, "grad_norm": 7.6164727210998535, "learning_rate": 1.8363281535708893e-07, "loss": 0.1454, "step": 160 }, { "epoch": 0.02010882422521883, "grad_norm": 8.535870552062988, "learning_rate": 1.8582636978288902e-07, "loss": 0.1514, "step": 170 }, { "epoch": 0.021291696238467, "grad_norm": 13.379010200500488, "learning_rate": 1.8789450790719713e-07, "loss": 0.1559, "step": 180 }, { "epoch": 0.022474568251715163, "grad_norm": 14.642189979553223, "learning_rate": 1.8985079875000403e-07, "loss": 0.1454, "step": 190 }, { "epoch": 0.02365744026496333, "grad_norm": 8.976862907409668, "learning_rate": 1.9170672179820648e-07, "loss": 0.1385, "step": 200 }, { "epoch": 0.0248403122782115, "grad_norm": 8.665103912353516, "learning_rate": 1.9347207517019823e-07, "loss": 0.1491, "step": 210 }, { "epoch": 0.026023184291459663, "grad_norm": 9.992353439331055, "learning_rate": 1.951552887412979e-07, "loss": 0.1442, "step": 220 }, { "epoch": 0.02720605630470783, "grad_norm": 8.408617973327637, "learning_rate": 1.9676366760784337e-07, "loss": 0.1227, "step": 230 }, { "epoch": 0.028388928317955996, "grad_norm": 9.101628303527832, "learning_rate": 1.9830358369756956e-07, "loss": 0.1274, "step": 240 }, { "epoch": 0.029571800331204164, "grad_norm": 6.791329860687256, "learning_rate": 1.9978062823932398e-07, "loss": 0.1319, "step": 250 }, { "epoch": 0.03075467234445233, "grad_norm": 9.759270668029785, "learning_rate": 2.011997343111262e-07, "loss": 0.1337, "step": 260 }, { "epoch": 0.0319375443577005, "grad_norm": 11.024235725402832, "learning_rate": 2.0256527624767776e-07, "loss": 0.1354, "step": 270 }, { "epoch": 0.03312041637094866, "grad_norm": 12.888561248779297, "learning_rate": 2.0388115096057069e-07, "loss": 0.1232, "step": 280 }, { "epoch": 0.03430328838419683, "grad_norm": 15.938399314880371, "learning_rate": 2.0515084498212263e-07, "loss": 0.1153, "step": 290 }, { "epoch": 0.035486160397444996, "grad_norm": 5.185990333557129, "learning_rate": 2.0637749013868709e-07, "loss": 0.111, "step": 300 }, { "epoch": 0.036669032410693164, "grad_norm": 8.360689163208008, "learning_rate": 2.0756391009182688e-07, "loss": 0.1279, "step": 310 }, { "epoch": 0.03785190442394133, "grad_norm": 11.132458686828613, "learning_rate": 2.08712659487942e-07, "loss": 0.1303, "step": 320 }, { "epoch": 0.03903477643718949, "grad_norm": 11.105474472045898, "learning_rate": 2.098260570817785e-07, "loss": 0.1158, "step": 330 }, { "epoch": 0.04021764845043766, "grad_norm": 9.6586275100708, "learning_rate": 2.109062139137421e-07, "loss": 0.1188, "step": 340 }, { "epoch": 0.04140052046368583, "grad_norm": 10.559638023376465, "learning_rate": 2.1195505740168819e-07, "loss": 0.1121, "step": 350 }, { "epoch": 0.042583392476934, "grad_norm": 6.894542217254639, "learning_rate": 2.129743520380502e-07, "loss": 0.1245, "step": 360 }, { "epoch": 0.043766264490182165, "grad_norm": 7.495611667633057, "learning_rate": 2.1396571725059865e-07, "loss": 0.121, "step": 370 }, { "epoch": 0.044949136503430326, "grad_norm": 11.715258598327637, "learning_rate": 2.149306428808571e-07, "loss": 0.1211, "step": 380 }, { "epoch": 0.046132008516678494, "grad_norm": 14.833856582641602, "learning_rate": 2.158705026516068e-07, "loss": 0.1146, "step": 390 }, { "epoch": 0.04731488052992666, "grad_norm": 14.33793830871582, "learning_rate": 2.1678656592905954e-07, "loss": 0.1162, "step": 400 }, { "epoch": 0.04849775254317483, "grad_norm": 12.371674537658691, "learning_rate": 2.176800080324376e-07, "loss": 0.1194, "step": 410 }, { "epoch": 0.049680624556423, "grad_norm": 9.522781372070312, "learning_rate": 2.185519193010513e-07, "loss": 0.1155, "step": 420 }, { "epoch": 0.05086349656967116, "grad_norm": 9.73621940612793, "learning_rate": 2.1940331309434692e-07, "loss": 0.0995, "step": 430 }, { "epoch": 0.052046368582919326, "grad_norm": 12.111953735351562, "learning_rate": 2.2023513287215096e-07, "loss": 0.1089, "step": 440 }, { "epoch": 0.053229240596167494, "grad_norm": 8.75721549987793, "learning_rate": 2.210482584791677e-07, "loss": 0.1096, "step": 450 }, { "epoch": 0.05441211260941566, "grad_norm": 7.599644184112549, "learning_rate": 2.2184351173869644e-07, "loss": 0.1128, "step": 460 }, { "epoch": 0.05559498462266383, "grad_norm": 7.008472442626953, "learning_rate": 2.2262166144472595e-07, "loss": 0.1146, "step": 470 }, { "epoch": 0.05677785663591199, "grad_norm": 12.320075988769531, "learning_rate": 2.2338342782842262e-07, "loss": 0.1189, "step": 480 }, { "epoch": 0.05796072864916016, "grad_norm": 7.030301570892334, "learning_rate": 2.2412948656405242e-07, "loss": 0.1005, "step": 490 }, { "epoch": 0.05914360066240833, "grad_norm": 9.608684539794922, "learning_rate": 2.2486047237017704e-07, "loss": 0.0999, "step": 500 }, { "epoch": 0.060326472675656495, "grad_norm": 8.57749080657959, "learning_rate": 2.2557698225422267e-07, "loss": 0.1117, "step": 510 }, { "epoch": 0.06150934468890466, "grad_norm": 9.727421760559082, "learning_rate": 2.2627957844197927e-07, "loss": 0.1032, "step": 520 }, { "epoch": 0.06269221670215283, "grad_norm": 6.170521259307861, "learning_rate": 2.2696879102804675e-07, "loss": 0.0974, "step": 530 }, { "epoch": 0.063875088715401, "grad_norm": 8.939896583557129, "learning_rate": 2.276451203785308e-07, "loss": 0.1033, "step": 540 }, { "epoch": 0.06505796072864917, "grad_norm": 10.871493339538574, "learning_rate": 2.2830903931326846e-07, "loss": 0.0975, "step": 550 }, { "epoch": 0.06624083274189732, "grad_norm": 6.474706649780273, "learning_rate": 2.2896099509142372e-07, "loss": 0.1127, "step": 560 }, { "epoch": 0.06742370475514549, "grad_norm": 5.916224002838135, "learning_rate": 2.2960141122133776e-07, "loss": 0.1024, "step": 570 }, { "epoch": 0.06860657676839366, "grad_norm": 8.948817253112793, "learning_rate": 2.302306891129757e-07, "loss": 0.0978, "step": 580 }, { "epoch": 0.06978944878164182, "grad_norm": 13.98409366607666, "learning_rate": 2.3084920958911782e-07, "loss": 0.1075, "step": 590 }, { "epoch": 0.07097232079488999, "grad_norm": 7.833242893218994, "learning_rate": 2.3145733426954015e-07, "loss": 0.0993, "step": 600 }, { "epoch": 0.07215519280813816, "grad_norm": 5.916186332702637, "learning_rate": 2.3205540684078437e-07, "loss": 0.1058, "step": 610 }, { "epoch": 0.07333806482138633, "grad_norm": 8.247550964355469, "learning_rate": 2.3264375422267994e-07, "loss": 0.1042, "step": 620 }, { "epoch": 0.0745209368346345, "grad_norm": 5.887234687805176, "learning_rate": 2.332226876415319e-07, "loss": 0.0894, "step": 630 }, { "epoch": 0.07570380884788266, "grad_norm": 14.199374198913574, "learning_rate": 2.3379250361879505e-07, "loss": 0.0992, "step": 640 }, { "epoch": 0.07688668086113083, "grad_norm": 7.511618614196777, "learning_rate": 2.3435348488309677e-07, "loss": 0.0959, "step": 650 }, { "epoch": 0.07806955287437899, "grad_norm": 6.56810188293457, "learning_rate": 2.3490590121263156e-07, "loss": 0.1011, "step": 660 }, { "epoch": 0.07925242488762715, "grad_norm": 10.343645095825195, "learning_rate": 2.3545001021421026e-07, "loss": 0.0953, "step": 670 }, { "epoch": 0.08043529690087532, "grad_norm": 8.300405502319336, "learning_rate": 2.3598605804459512e-07, "loss": 0.0943, "step": 680 }, { "epoch": 0.08161816891412349, "grad_norm": 7.243466377258301, "learning_rate": 2.3651428007917702e-07, "loss": 0.088, "step": 690 }, { "epoch": 0.08280104092737166, "grad_norm": 7.334621906280518, "learning_rate": 2.3703490153254125e-07, "loss": 0.1081, "step": 700 }, { "epoch": 0.08398391294061983, "grad_norm": 6.599380016326904, "learning_rate": 2.3754813803501674e-07, "loss": 0.1116, "step": 710 }, { "epoch": 0.085166784953868, "grad_norm": 6.391559600830078, "learning_rate": 2.3805419616890326e-07, "loss": 0.0997, "step": 720 }, { "epoch": 0.08634965696711616, "grad_norm": 10.18539810180664, "learning_rate": 2.3855327396771385e-07, "loss": 0.098, "step": 730 }, { "epoch": 0.08753252898036433, "grad_norm": 6.910757064819336, "learning_rate": 2.390455613814517e-07, "loss": 0.0956, "step": 740 }, { "epoch": 0.0887154009936125, "grad_norm": 10.466629028320312, "learning_rate": 2.3953124071065765e-07, "loss": 0.0966, "step": 750 }, { "epoch": 0.08989827300686065, "grad_norm": 5.739569187164307, "learning_rate": 2.400104870117102e-07, "loss": 0.1085, "step": 760 }, { "epoch": 0.09108114502010882, "grad_norm": 6.02729606628418, "learning_rate": 2.4048346847563264e-07, "loss": 0.0969, "step": 770 }, { "epoch": 0.09226401703335699, "grad_norm": 6.7907891273498535, "learning_rate": 2.409503467824599e-07, "loss": 0.0895, "step": 780 }, { "epoch": 0.09344688904660516, "grad_norm": 6.296230316162109, "learning_rate": 2.4141127743303073e-07, "loss": 0.0917, "step": 790 }, { "epoch": 0.09462976105985332, "grad_norm": 7.377506256103516, "learning_rate": 2.418664100599126e-07, "loss": 0.0912, "step": 800 }, { "epoch": 0.09581263307310149, "grad_norm": 6.594316482543945, "learning_rate": 2.423158887190114e-07, "loss": 0.105, "step": 810 }, { "epoch": 0.09699550508634966, "grad_norm": 6.555089473724365, "learning_rate": 2.427598521632906e-07, "loss": 0.1013, "step": 820 }, { "epoch": 0.09817837709959783, "grad_norm": 7.4037065505981445, "learning_rate": 2.431984340998997e-07, "loss": 0.1025, "step": 830 }, { "epoch": 0.099361249112846, "grad_norm": 10.444832801818848, "learning_rate": 2.4363176343190433e-07, "loss": 0.0939, "step": 840 }, { "epoch": 0.10054412112609416, "grad_norm": 5.749758720397949, "learning_rate": 2.440599644857127e-07, "loss": 0.0974, "step": 850 }, { "epoch": 0.10172699313934232, "grad_norm": 6.279892444610596, "learning_rate": 2.444831572252e-07, "loss": 0.1011, "step": 860 }, { "epoch": 0.10290986515259049, "grad_norm": 9.80026912689209, "learning_rate": 2.4490145745345636e-07, "loss": 0.0974, "step": 870 }, { "epoch": 0.10409273716583865, "grad_norm": 6.575058460235596, "learning_rate": 2.45314977003004e-07, "loss": 0.1021, "step": 880 }, { "epoch": 0.10527560917908682, "grad_norm": 11.099271774291992, "learning_rate": 2.457238239152683e-07, "loss": 0.1085, "step": 890 }, { "epoch": 0.10645848119233499, "grad_norm": 3.821603775024414, "learning_rate": 2.461281026100208e-07, "loss": 0.0898, "step": 900 }, { "epoch": 0.10764135320558316, "grad_norm": 9.390056610107422, "learning_rate": 2.46527914045461e-07, "loss": 0.0982, "step": 910 }, { "epoch": 0.10882422521883132, "grad_norm": 6.1678690910339355, "learning_rate": 2.469233558695495e-07, "loss": 0.0938, "step": 920 }, { "epoch": 0.11000709723207949, "grad_norm": 7.588784217834473, "learning_rate": 2.4731452256316055e-07, "loss": 0.0868, "step": 930 }, { "epoch": 0.11118996924532766, "grad_norm": 6.578592300415039, "learning_rate": 2.4770150557557904e-07, "loss": 0.0989, "step": 940 }, { "epoch": 0.11237284125857583, "grad_norm": 3.994450569152832, "learning_rate": 2.480843934528277e-07, "loss": 0.0872, "step": 950 }, { "epoch": 0.11355571327182398, "grad_norm": 4.998359680175781, "learning_rate": 2.484632719592757e-07, "loss": 0.0914, "step": 960 }, { "epoch": 0.11473858528507215, "grad_norm": 5.349532604217529, "learning_rate": 2.488382241929453e-07, "loss": 0.088, "step": 970 }, { "epoch": 0.11592145729832032, "grad_norm": 5.934182643890381, "learning_rate": 2.4920933069490546e-07, "loss": 0.0843, "step": 980 }, { "epoch": 0.11710432931156849, "grad_norm": 8.765074729919434, "learning_rate": 2.4957666955311217e-07, "loss": 0.0972, "step": 990 }, { "epoch": 0.11828720132481665, "grad_norm": 6.881076335906982, "learning_rate": 2.499403165010301e-07, "loss": 0.0992, "step": 1000 }, { "epoch": 0.11947007333806482, "grad_norm": 7.297924995422363, "learning_rate": 2.5030034501134775e-07, "loss": 0.096, "step": 1010 }, { "epoch": 0.12065294535131299, "grad_norm": 6.577905178070068, "learning_rate": 2.5065682638507576e-07, "loss": 0.0977, "step": 1020 }, { "epoch": 0.12183581736456116, "grad_norm": 6.630699634552002, "learning_rate": 2.5100982983629864e-07, "loss": 0.0842, "step": 1030 }, { "epoch": 0.12301868937780933, "grad_norm": 7.502241134643555, "learning_rate": 2.513594225728323e-07, "loss": 0.091, "step": 1040 }, { "epoch": 0.1242015613910575, "grad_norm": 7.249358177185059, "learning_rate": 2.517056698730219e-07, "loss": 0.087, "step": 1050 }, { "epoch": 0.12538443340430566, "grad_norm": 7.205135345458984, "learning_rate": 2.5204863515889984e-07, "loss": 0.09, "step": 1060 }, { "epoch": 0.12656730541755382, "grad_norm": 5.302363872528076, "learning_rate": 2.5238838006590917e-07, "loss": 0.0986, "step": 1070 }, { "epoch": 0.127750177430802, "grad_norm": 7.220069408416748, "learning_rate": 2.5272496450938387e-07, "loss": 0.0996, "step": 1080 }, { "epoch": 0.12893304944405015, "grad_norm": 6.973414421081543, "learning_rate": 2.5305844674796497e-07, "loss": 0.0986, "step": 1090 }, { "epoch": 0.13011592145729833, "grad_norm": 7.011667728424072, "learning_rate": 2.533888834441215e-07, "loss": 0.0901, "step": 1100 }, { "epoch": 0.1312987934705465, "grad_norm": 6.291499137878418, "learning_rate": 2.537163297219323e-07, "loss": 0.0962, "step": 1110 }, { "epoch": 0.13248166548379464, "grad_norm": 4.1603498458862305, "learning_rate": 2.540408392222768e-07, "loss": 0.0872, "step": 1120 }, { "epoch": 0.13366453749704282, "grad_norm": 5.578655242919922, "learning_rate": 2.5436246415557274e-07, "loss": 0.087, "step": 1130 }, { "epoch": 0.13484740951029098, "grad_norm": 6.366602897644043, "learning_rate": 2.546812553521908e-07, "loss": 0.0897, "step": 1140 }, { "epoch": 0.13603028152353916, "grad_norm": 4.836014747619629, "learning_rate": 2.5499726231066703e-07, "loss": 0.0887, "step": 1150 }, { "epoch": 0.1372131535367873, "grad_norm": 4.914805889129639, "learning_rate": 2.553105332438288e-07, "loss": 0.0954, "step": 1160 }, { "epoch": 0.1383960255500355, "grad_norm": 4.782810211181641, "learning_rate": 2.556211151229405e-07, "loss": 0.0921, "step": 1170 }, { "epoch": 0.13957889756328365, "grad_norm": 4.402538299560547, "learning_rate": 2.559290537199709e-07, "loss": 0.0902, "step": 1180 }, { "epoch": 0.14076176957653183, "grad_norm": 6.9891815185546875, "learning_rate": 2.5623439364807683e-07, "loss": 0.0854, "step": 1190 }, { "epoch": 0.14194464158977999, "grad_norm": 11.530652046203613, "learning_rate": 2.565371784003932e-07, "loss": 0.0985, "step": 1200 }, { "epoch": 0.14312751360302814, "grad_norm": 6.175468444824219, "learning_rate": 2.5683745038721294e-07, "loss": 0.0998, "step": 1210 }, { "epoch": 0.14431038561627632, "grad_norm": 5.1096014976501465, "learning_rate": 2.5713525097163744e-07, "loss": 0.0889, "step": 1220 }, { "epoch": 0.14549325762952448, "grad_norm": 9.522542953491211, "learning_rate": 2.574306205037713e-07, "loss": 0.0891, "step": 1230 }, { "epoch": 0.14667612964277266, "grad_norm": 6.2208571434021, "learning_rate": 2.57723598353533e-07, "loss": 0.0891, "step": 1240 }, { "epoch": 0.1478590016560208, "grad_norm": 12.990601539611816, "learning_rate": 2.5801422294214763e-07, "loss": 0.0954, "step": 1250 }, { "epoch": 0.149041873669269, "grad_norm": 4.334107398986816, "learning_rate": 2.58302531772385e-07, "loss": 0.0951, "step": 1260 }, { "epoch": 0.15022474568251715, "grad_norm": 4.620918273925781, "learning_rate": 2.585885614576023e-07, "loss": 0.0868, "step": 1270 }, { "epoch": 0.15140761769576533, "grad_norm": 6.769965171813965, "learning_rate": 2.588723477496481e-07, "loss": 0.0951, "step": 1280 }, { "epoch": 0.15259048970901348, "grad_norm": 5.419663429260254, "learning_rate": 2.591539255656806e-07, "loss": 0.0832, "step": 1290 }, { "epoch": 0.15377336172226166, "grad_norm": 3.968998908996582, "learning_rate": 2.5943332901394983e-07, "loss": 0.0858, "step": 1300 }, { "epoch": 0.15495623373550982, "grad_norm": 4.266129016876221, "learning_rate": 2.5971059141859263e-07, "loss": 0.1064, "step": 1310 }, { "epoch": 0.15613910574875797, "grad_norm": 3.961754560470581, "learning_rate": 2.5998574534348463e-07, "loss": 0.0945, "step": 1320 }, { "epoch": 0.15732197776200615, "grad_norm": 9.148289680480957, "learning_rate": 2.602588226151919e-07, "loss": 0.0874, "step": 1330 }, { "epoch": 0.1585048497752543, "grad_norm": 5.3835062980651855, "learning_rate": 2.605298543450633e-07, "loss": 0.0885, "step": 1340 }, { "epoch": 0.1596877217885025, "grad_norm": 6.187718868255615, "learning_rate": 2.607988709505014e-07, "loss": 0.0802, "step": 1350 }, { "epoch": 0.16087059380175064, "grad_norm": 4.033183574676514, "learning_rate": 2.6106590217544816e-07, "loss": 0.0957, "step": 1360 }, { "epoch": 0.16205346581499883, "grad_norm": 6.290068626403809, "learning_rate": 2.6133097711012103e-07, "loss": 0.0868, "step": 1370 }, { "epoch": 0.16323633782824698, "grad_norm": 6.378200531005859, "learning_rate": 2.615941242100301e-07, "loss": 0.0919, "step": 1380 }, { "epoch": 0.16441920984149516, "grad_norm": 6.864925384521484, "learning_rate": 2.6185537131431014e-07, "loss": 0.0884, "step": 1390 }, { "epoch": 0.16560208185474332, "grad_norm": 6.213853359222412, "learning_rate": 2.621147456633943e-07, "loss": 0.0835, "step": 1400 }, { "epoch": 0.16678495386799147, "grad_norm": 5.606426239013672, "learning_rate": 2.6237227391605965e-07, "loss": 0.0934, "step": 1410 }, { "epoch": 0.16796782588123965, "grad_norm": 3.546708583831787, "learning_rate": 2.626279821658698e-07, "loss": 0.0869, "step": 1420 }, { "epoch": 0.1691506978944878, "grad_norm": 5.095823287963867, "learning_rate": 2.628818959570412e-07, "loss": 0.0859, "step": 1430 }, { "epoch": 0.170333569907736, "grad_norm": 6.156503200531006, "learning_rate": 2.631340402997563e-07, "loss": 0.0921, "step": 1440 }, { "epoch": 0.17151644192098414, "grad_norm": 4.071560859680176, "learning_rate": 2.633844396849463e-07, "loss": 0.0941, "step": 1450 }, { "epoch": 0.17269931393423232, "grad_norm": 4.59564208984375, "learning_rate": 2.636331180985669e-07, "loss": 0.0964, "step": 1460 }, { "epoch": 0.17388218594748048, "grad_norm": 4.265997409820557, "learning_rate": 2.638800990353861e-07, "loss": 0.092, "step": 1470 }, { "epoch": 0.17506505796072866, "grad_norm": 3.987760543823242, "learning_rate": 2.641254055123048e-07, "loss": 0.0931, "step": 1480 }, { "epoch": 0.1762479299739768, "grad_norm": 3.4393296241760254, "learning_rate": 2.6436906008122883e-07, "loss": 0.0833, "step": 1490 }, { "epoch": 0.177430801987225, "grad_norm": 3.3839104175567627, "learning_rate": 2.6461108484151077e-07, "loss": 0.0897, "step": 1500 }, { "epoch": 0.17861367400047315, "grad_norm": 7.4513444900512695, "learning_rate": 2.6485150145197777e-07, "loss": 0.0922, "step": 1510 }, { "epoch": 0.1797965460137213, "grad_norm": 8.25451374053955, "learning_rate": 2.6509033114256325e-07, "loss": 0.0953, "step": 1520 }, { "epoch": 0.18097941802696949, "grad_norm": 7.311097145080566, "learning_rate": 2.6532759472555637e-07, "loss": 0.0962, "step": 1530 }, { "epoch": 0.18216229004021764, "grad_norm": 6.9265851974487305, "learning_rate": 2.655633126064857e-07, "loss": 0.097, "step": 1540 }, { "epoch": 0.18334516205346582, "grad_norm": 4.762001991271973, "learning_rate": 2.6579750479465053e-07, "loss": 0.0915, "step": 1550 }, { "epoch": 0.18452803406671398, "grad_norm": 6.176040172576904, "learning_rate": 2.660301909133129e-07, "loss": 0.1008, "step": 1560 }, { "epoch": 0.18571090607996216, "grad_norm": 5.207606315612793, "learning_rate": 2.6626139020956534e-07, "loss": 0.0936, "step": 1570 }, { "epoch": 0.1868937780932103, "grad_norm": 4.642921447753906, "learning_rate": 2.664911215638838e-07, "loss": 0.0873, "step": 1580 }, { "epoch": 0.1880766501064585, "grad_norm": 5.509186267852783, "learning_rate": 2.6671940349938045e-07, "loss": 0.0863, "step": 1590 }, { "epoch": 0.18925952211970665, "grad_norm": 5.439513206481934, "learning_rate": 2.6694625419076565e-07, "loss": 0.0945, "step": 1600 }, { "epoch": 0.1904423941329548, "grad_norm": 4.10603666305542, "learning_rate": 2.6717169147303124e-07, "loss": 0.0954, "step": 1610 }, { "epoch": 0.19162526614620298, "grad_norm": 5.221415042877197, "learning_rate": 2.673957328498645e-07, "loss": 0.0876, "step": 1620 }, { "epoch": 0.19280813815945114, "grad_norm": 5.306149482727051, "learning_rate": 2.6761839550180366e-07, "loss": 0.0919, "step": 1630 }, { "epoch": 0.19399101017269932, "grad_norm": 6.133793354034424, "learning_rate": 2.6783969629414374e-07, "loss": 0.0886, "step": 1640 }, { "epoch": 0.19517388218594747, "grad_norm": 3.1125082969665527, "learning_rate": 2.6805965178460216e-07, "loss": 0.0789, "step": 1650 }, { "epoch": 0.19635675419919565, "grad_norm": 6.0536041259765625, "learning_rate": 2.6827827823075276e-07, "loss": 0.0922, "step": 1660 }, { "epoch": 0.1975396262124438, "grad_norm": 5.956566333770752, "learning_rate": 2.684955915972366e-07, "loss": 0.1, "step": 1670 }, { "epoch": 0.198722498225692, "grad_norm": 4.950603008270264, "learning_rate": 2.687116075627574e-07, "loss": 0.0825, "step": 1680 }, { "epoch": 0.19990537023894014, "grad_norm": 9.585925102233887, "learning_rate": 2.6892634152686956e-07, "loss": 0.0879, "step": 1690 }, { "epoch": 0.20108824225218833, "grad_norm": 4.9892072677612305, "learning_rate": 2.691398086165657e-07, "loss": 0.0848, "step": 1700 }, { "epoch": 0.20227111426543648, "grad_norm": 6.892628192901611, "learning_rate": 2.693520236926714e-07, "loss": 0.0924, "step": 1710 }, { "epoch": 0.20345398627868463, "grad_norm": 4.501282691955566, "learning_rate": 2.6956300135605307e-07, "loss": 0.0933, "step": 1720 }, { "epoch": 0.20463685829193282, "grad_norm": 5.998622894287109, "learning_rate": 2.697727559536461e-07, "loss": 0.0919, "step": 1730 }, { "epoch": 0.20581973030518097, "grad_norm": 5.121301651000977, "learning_rate": 2.699813015843094e-07, "loss": 0.0866, "step": 1740 }, { "epoch": 0.20700260231842915, "grad_norm": 5.390039920806885, "learning_rate": 2.7018865210451184e-07, "loss": 0.0858, "step": 1750 }, { "epoch": 0.2081854743316773, "grad_norm": 5.33758544921875, "learning_rate": 2.703948211338571e-07, "loss": 0.0972, "step": 1760 }, { "epoch": 0.2093683463449255, "grad_norm": 6.311947822570801, "learning_rate": 2.705998220604515e-07, "loss": 0.099, "step": 1770 }, { "epoch": 0.21055121835817364, "grad_norm": 4.473199844360352, "learning_rate": 2.7080366804612135e-07, "loss": 0.0924, "step": 1780 }, { "epoch": 0.21173409037142182, "grad_norm": 7.230584621429443, "learning_rate": 2.710063720314832e-07, "loss": 0.087, "step": 1790 }, { "epoch": 0.21291696238466998, "grad_norm": 4.157140731811523, "learning_rate": 2.7120794674087385e-07, "loss": 0.0853, "step": 1800 }, { "epoch": 0.21409983439791813, "grad_norm": 5.474308490753174, "learning_rate": 2.714084046871429e-07, "loss": 0.0952, "step": 1810 }, { "epoch": 0.2152827064111663, "grad_norm": 4.608627796173096, "learning_rate": 2.7160775817631404e-07, "loss": 0.0809, "step": 1820 }, { "epoch": 0.21646557842441447, "grad_norm": 3.6023459434509277, "learning_rate": 2.7180601931211804e-07, "loss": 0.0827, "step": 1830 }, { "epoch": 0.21764845043766265, "grad_norm": 5.581504821777344, "learning_rate": 2.720032000004025e-07, "loss": 0.0858, "step": 1840 }, { "epoch": 0.2188313224509108, "grad_norm": 4.189643383026123, "learning_rate": 2.7219931195342225e-07, "loss": 0.0859, "step": 1850 }, { "epoch": 0.22001419446415899, "grad_norm": 4.908015727996826, "learning_rate": 2.723943666940136e-07, "loss": 0.096, "step": 1860 }, { "epoch": 0.22119706647740714, "grad_norm": 3.7516977787017822, "learning_rate": 2.7258837555965713e-07, "loss": 0.0904, "step": 1870 }, { "epoch": 0.22237993849065532, "grad_norm": 4.31256628036499, "learning_rate": 2.727813497064321e-07, "loss": 0.0943, "step": 1880 }, { "epoch": 0.22356281050390348, "grad_norm": 8.408005714416504, "learning_rate": 2.729733001128656e-07, "loss": 0.0811, "step": 1890 }, { "epoch": 0.22474568251715166, "grad_norm": 4.453146457672119, "learning_rate": 2.731642375836808e-07, "loss": 0.0842, "step": 1900 }, { "epoch": 0.2259285545303998, "grad_norm": 5.534027576446533, "learning_rate": 2.7335417275344585e-07, "loss": 0.088, "step": 1910 }, { "epoch": 0.22711142654364797, "grad_norm": 3.421473264694214, "learning_rate": 2.735431160901288e-07, "loss": 0.087, "step": 1920 }, { "epoch": 0.22829429855689615, "grad_norm": 5.357019424438477, "learning_rate": 2.737310778985586e-07, "loss": 0.0904, "step": 1930 }, { "epoch": 0.2294771705701443, "grad_norm": 5.758317470550537, "learning_rate": 2.7391806832379834e-07, "loss": 0.0989, "step": 1940 }, { "epoch": 0.23066004258339248, "grad_norm": 16.539594650268555, "learning_rate": 2.741040973544305e-07, "loss": 0.0873, "step": 1950 }, { "epoch": 0.23184291459664064, "grad_norm": 4.858366012573242, "learning_rate": 2.742891748257585e-07, "loss": 0.0963, "step": 1960 }, { "epoch": 0.23302578660988882, "grad_norm": 4.561233043670654, "learning_rate": 2.744733104229276e-07, "loss": 0.0863, "step": 1970 }, { "epoch": 0.23420865862313697, "grad_norm": 5.5304741859436035, "learning_rate": 2.7465651368396524e-07, "loss": 0.0855, "step": 1980 }, { "epoch": 0.23539153063638515, "grad_norm": 5.525140285491943, "learning_rate": 2.748387940027463e-07, "loss": 0.0813, "step": 1990 }, { "epoch": 0.2365744026496333, "grad_norm": 4.4344377517700195, "learning_rate": 2.7502016063188317e-07, "loss": 0.0724, "step": 2000 }, { "epoch": 0.23775727466288146, "grad_norm": 5.21195125579834, "learning_rate": 2.7520062268554395e-07, "loss": 0.1001, "step": 2010 }, { "epoch": 0.23894014667612964, "grad_norm": 3.833998441696167, "learning_rate": 2.753801891422008e-07, "loss": 0.085, "step": 2020 }, { "epoch": 0.2401230186893778, "grad_norm": 4.344178199768066, "learning_rate": 2.755588688473105e-07, "loss": 0.0846, "step": 2030 }, { "epoch": 0.24130589070262598, "grad_norm": 3.7756147384643555, "learning_rate": 2.757366705159288e-07, "loss": 0.0785, "step": 2040 }, { "epoch": 0.24248876271587413, "grad_norm": 4.346851348876953, "learning_rate": 2.759136027352612e-07, "loss": 0.0788, "step": 2050 }, { "epoch": 0.24367163472912232, "grad_norm": 3.319274425506592, "learning_rate": 2.760896739671517e-07, "loss": 0.0878, "step": 2060 }, { "epoch": 0.24485450674237047, "grad_norm": 6.814015865325928, "learning_rate": 2.762648925505107e-07, "loss": 0.0838, "step": 2070 }, { "epoch": 0.24603737875561865, "grad_norm": 4.570865631103516, "learning_rate": 2.7643926670368537e-07, "loss": 0.0799, "step": 2080 }, { "epoch": 0.2472202507688668, "grad_norm": 5.0868940353393555, "learning_rate": 2.7661280452677217e-07, "loss": 0.0733, "step": 2090 }, { "epoch": 0.248403122782115, "grad_norm": 3.0565061569213867, "learning_rate": 2.76785514003875e-07, "loss": 0.084, "step": 2100 }, { "epoch": 0.24958599479536314, "grad_norm": 3.6972484588623047, "learning_rate": 2.7695740300530895e-07, "loss": 0.0943, "step": 2110 }, { "epoch": 0.2507688668086113, "grad_norm": 3.02738881111145, "learning_rate": 2.771284792897529e-07, "loss": 0.0852, "step": 2120 }, { "epoch": 0.25195173882185945, "grad_norm": 6.243197441101074, "learning_rate": 2.7729875050635036e-07, "loss": 0.0855, "step": 2130 }, { "epoch": 0.25313461083510763, "grad_norm": 4.515888690948486, "learning_rate": 2.7746822419676223e-07, "loss": 0.0844, "step": 2140 }, { "epoch": 0.2543174828483558, "grad_norm": 8.201851844787598, "learning_rate": 2.776369077971706e-07, "loss": 0.0814, "step": 2150 }, { "epoch": 0.255500354861604, "grad_norm": 3.712568521499634, "learning_rate": 2.7780480864023693e-07, "loss": 0.0849, "step": 2160 }, { "epoch": 0.2566832268748521, "grad_norm": 3.514626979827881, "learning_rate": 2.7797193395701474e-07, "loss": 0.0894, "step": 2170 }, { "epoch": 0.2578660988881003, "grad_norm": 5.7424516677856445, "learning_rate": 2.7813829087881804e-07, "loss": 0.0825, "step": 2180 }, { "epoch": 0.2590489709013485, "grad_norm": 5.844192028045654, "learning_rate": 2.783038864390475e-07, "loss": 0.0888, "step": 2190 }, { "epoch": 0.26023184291459667, "grad_norm": 5.052335739135742, "learning_rate": 2.7846872757497456e-07, "loss": 0.0903, "step": 2200 }, { "epoch": 0.2614147149278448, "grad_norm": 6.066333770751953, "learning_rate": 2.786328211294854e-07, "loss": 0.0918, "step": 2210 }, { "epoch": 0.262597586941093, "grad_norm": 3.7724835872650146, "learning_rate": 2.787961738527854e-07, "loss": 0.0884, "step": 2220 }, { "epoch": 0.26378045895434116, "grad_norm": 3.7617435455322266, "learning_rate": 2.789587924040652e-07, "loss": 0.0831, "step": 2230 }, { "epoch": 0.2649633309675893, "grad_norm": 6.675604343414307, "learning_rate": 2.7912068335312985e-07, "loss": 0.0774, "step": 2240 }, { "epoch": 0.26614620298083747, "grad_norm": 4.29455041885376, "learning_rate": 2.792818531819914e-07, "loss": 0.0809, "step": 2250 }, { "epoch": 0.26732907499408565, "grad_norm": 2.7277703285217285, "learning_rate": 2.794423082864258e-07, "loss": 0.0915, "step": 2260 }, { "epoch": 0.26851194700733383, "grad_norm": 3.5644021034240723, "learning_rate": 2.7960205497749664e-07, "loss": 0.086, "step": 2270 }, { "epoch": 0.26969481902058196, "grad_norm": 5.203859806060791, "learning_rate": 2.7976109948304386e-07, "loss": 0.084, "step": 2280 }, { "epoch": 0.27087769103383014, "grad_norm": 5.591561794281006, "learning_rate": 2.799194479491409e-07, "loss": 0.0789, "step": 2290 }, { "epoch": 0.2720605630470783, "grad_norm": 4.48732852935791, "learning_rate": 2.800771064415201e-07, "loss": 0.0829, "step": 2300 }, { "epoch": 0.27324343506032645, "grad_norm": 3.2346603870391846, "learning_rate": 2.8023408094696636e-07, "loss": 0.0948, "step": 2310 }, { "epoch": 0.2744263070735746, "grad_norm": 3.780332088470459, "learning_rate": 2.803903773746819e-07, "loss": 0.0923, "step": 2320 }, { "epoch": 0.2756091790868228, "grad_norm": 6.343169212341309, "learning_rate": 2.8054600155762027e-07, "loss": 0.0826, "step": 2330 }, { "epoch": 0.276792051100071, "grad_norm": 7.227244853973389, "learning_rate": 2.807009592537935e-07, "loss": 0.084, "step": 2340 }, { "epoch": 0.2779749231133191, "grad_norm": 3.869154930114746, "learning_rate": 2.808552561475496e-07, "loss": 0.084, "step": 2350 }, { "epoch": 0.2791577951265673, "grad_norm": 3.361931324005127, "learning_rate": 2.8100889785082397e-07, "loss": 0.0876, "step": 2360 }, { "epoch": 0.2803406671398155, "grad_norm": 4.093608856201172, "learning_rate": 2.811618899043644e-07, "loss": 0.0863, "step": 2370 }, { "epoch": 0.28152353915306366, "grad_norm": 4.153050422668457, "learning_rate": 2.813142377789299e-07, "loss": 0.0916, "step": 2380 }, { "epoch": 0.2827064111663118, "grad_norm": 4.637054443359375, "learning_rate": 2.8146594687646443e-07, "loss": 0.0849, "step": 2390 }, { "epoch": 0.28388928317955997, "grad_norm": 3.128478527069092, "learning_rate": 2.8161702253124625e-07, "loss": 0.0813, "step": 2400 }, { "epoch": 0.28507215519280815, "grad_norm": 4.976253032684326, "learning_rate": 2.817674700110135e-07, "loss": 0.0924, "step": 2410 }, { "epoch": 0.2862550272060563, "grad_norm": 4.830815315246582, "learning_rate": 2.81917294518066e-07, "loss": 0.0865, "step": 2420 }, { "epoch": 0.28743789921930446, "grad_norm": 5.575986862182617, "learning_rate": 2.820665011903451e-07, "loss": 0.091, "step": 2430 }, { "epoch": 0.28862077123255264, "grad_norm": 6.756980895996094, "learning_rate": 2.8221509510249045e-07, "loss": 0.0848, "step": 2440 }, { "epoch": 0.2898036432458008, "grad_norm": 3.78298282623291, "learning_rate": 2.8236308126687605e-07, "loss": 0.0876, "step": 2450 }, { "epoch": 0.29098651525904895, "grad_norm": 3.41170597076416, "learning_rate": 2.8251046463462434e-07, "loss": 0.0799, "step": 2460 }, { "epoch": 0.29216938727229713, "grad_norm": 4.09858512878418, "learning_rate": 2.8265725009660045e-07, "loss": 0.0883, "step": 2470 }, { "epoch": 0.2933522592855453, "grad_norm": 5.9748992919921875, "learning_rate": 2.82803442484386e-07, "loss": 0.0852, "step": 2480 }, { "epoch": 0.2945351312987935, "grad_norm": 4.529118537902832, "learning_rate": 2.8294904657123336e-07, "loss": 0.0868, "step": 2490 }, { "epoch": 0.2957180033120416, "grad_norm": 3.11757755279541, "learning_rate": 2.830940670730007e-07, "loss": 0.0832, "step": 2500 }, { "epoch": 0.2969008753252898, "grad_norm": 2.7838950157165527, "learning_rate": 2.832385086490685e-07, "loss": 0.0846, "step": 2510 }, { "epoch": 0.298083747338538, "grad_norm": 5.538972854614258, "learning_rate": 2.8338237590323806e-07, "loss": 0.0836, "step": 2520 }, { "epoch": 0.2992666193517861, "grad_norm": 4.560507774353027, "learning_rate": 2.8352567338461153e-07, "loss": 0.0911, "step": 2530 }, { "epoch": 0.3004494913650343, "grad_norm": 3.7603893280029297, "learning_rate": 2.8360079282377696e-07, "loss": 0.0823, "step": 2540 }, { "epoch": 0.3016323633782825, "grad_norm": 3.9844887256622314, "learning_rate": 2.8347653186210506e-07, "loss": 0.0814, "step": 2550 }, { "epoch": 0.30281523539153066, "grad_norm": 3.134514570236206, "learning_rate": 2.8335227090043317e-07, "loss": 0.0884, "step": 2560 }, { "epoch": 0.3039981074047788, "grad_norm": 3.922294855117798, "learning_rate": 2.832280099387613e-07, "loss": 0.0821, "step": 2570 }, { "epoch": 0.30518097941802697, "grad_norm": 4.164619445800781, "learning_rate": 2.831037489770894e-07, "loss": 0.0844, "step": 2580 }, { "epoch": 0.30636385143127515, "grad_norm": 5.076725959777832, "learning_rate": 2.8297948801541743e-07, "loss": 0.0863, "step": 2590 }, { "epoch": 0.30754672344452333, "grad_norm": 2.610517978668213, "learning_rate": 2.828552270537456e-07, "loss": 0.0897, "step": 2600 }, { "epoch": 0.30872959545777146, "grad_norm": 4.391855716705322, "learning_rate": 2.827309660920737e-07, "loss": 0.0793, "step": 2610 }, { "epoch": 0.30991246747101964, "grad_norm": 3.9324872493743896, "learning_rate": 2.8260670513040175e-07, "loss": 0.0883, "step": 2620 }, { "epoch": 0.3110953394842678, "grad_norm": 4.361207962036133, "learning_rate": 2.8248244416872986e-07, "loss": 0.0816, "step": 2630 }, { "epoch": 0.31227821149751595, "grad_norm": 5.000744342803955, "learning_rate": 2.8235818320705797e-07, "loss": 0.0861, "step": 2640 }, { "epoch": 0.3134610835107641, "grad_norm": 2.776132822036743, "learning_rate": 2.8223392224538607e-07, "loss": 0.0752, "step": 2650 }, { "epoch": 0.3146439555240123, "grad_norm": 3.8075029850006104, "learning_rate": 2.821096612837142e-07, "loss": 0.0798, "step": 2660 }, { "epoch": 0.3158268275372605, "grad_norm": 3.226005792617798, "learning_rate": 2.8198540032204223e-07, "loss": 0.0923, "step": 2670 }, { "epoch": 0.3170096995505086, "grad_norm": 3.0657901763916016, "learning_rate": 2.818611393603704e-07, "loss": 0.0863, "step": 2680 }, { "epoch": 0.3181925715637568, "grad_norm": 3.3312923908233643, "learning_rate": 2.817368783986985e-07, "loss": 0.0839, "step": 2690 }, { "epoch": 0.319375443577005, "grad_norm": 3.0238230228424072, "learning_rate": 2.8161261743702655e-07, "loss": 0.0834, "step": 2700 }, { "epoch": 0.32055831559025316, "grad_norm": 3.247262716293335, "learning_rate": 2.8148835647535466e-07, "loss": 0.0823, "step": 2710 }, { "epoch": 0.3217411876035013, "grad_norm": 4.298614025115967, "learning_rate": 2.8136409551368276e-07, "loss": 0.0807, "step": 2720 }, { "epoch": 0.32292405961674947, "grad_norm": 5.09541654586792, "learning_rate": 2.8123983455201087e-07, "loss": 0.0876, "step": 2730 }, { "epoch": 0.32410693162999765, "grad_norm": 5.008078575134277, "learning_rate": 2.81115573590339e-07, "loss": 0.0823, "step": 2740 }, { "epoch": 0.3252898036432458, "grad_norm": 4.176627159118652, "learning_rate": 2.8099131262866703e-07, "loss": 0.0877, "step": 2750 }, { "epoch": 0.32647267565649396, "grad_norm": 5.339503765106201, "learning_rate": 2.808670516669952e-07, "loss": 0.0857, "step": 2760 }, { "epoch": 0.32765554766974214, "grad_norm": 3.2683210372924805, "learning_rate": 2.807427907053233e-07, "loss": 0.0832, "step": 2770 }, { "epoch": 0.3288384196829903, "grad_norm": 4.670903205871582, "learning_rate": 2.806185297436514e-07, "loss": 0.0866, "step": 2780 }, { "epoch": 0.33002129169623845, "grad_norm": 3.791898727416992, "learning_rate": 2.8049426878197946e-07, "loss": 0.0832, "step": 2790 }, { "epoch": 0.33120416370948663, "grad_norm": 4.271515846252441, "learning_rate": 2.8037000782030756e-07, "loss": 0.0927, "step": 2800 }, { "epoch": 0.3323870357227348, "grad_norm": 3.2368903160095215, "learning_rate": 2.8024574685863567e-07, "loss": 0.0817, "step": 2810 }, { "epoch": 0.33356990773598294, "grad_norm": 4.760076522827148, "learning_rate": 2.801214858969638e-07, "loss": 0.0835, "step": 2820 }, { "epoch": 0.3347527797492311, "grad_norm": 3.1779420375823975, "learning_rate": 2.799972249352919e-07, "loss": 0.0854, "step": 2830 }, { "epoch": 0.3359356517624793, "grad_norm": 5.107559680938721, "learning_rate": 2.7987296397361994e-07, "loss": 0.0854, "step": 2840 }, { "epoch": 0.3371185237757275, "grad_norm": 5.073968887329102, "learning_rate": 2.797487030119481e-07, "loss": 0.0751, "step": 2850 }, { "epoch": 0.3383013957889756, "grad_norm": 5.731428146362305, "learning_rate": 2.796244420502762e-07, "loss": 0.0835, "step": 2860 }, { "epoch": 0.3394842678022238, "grad_norm": 4.343652725219727, "learning_rate": 2.7950018108860425e-07, "loss": 0.0912, "step": 2870 }, { "epoch": 0.340667139815472, "grad_norm": 3.8376545906066895, "learning_rate": 2.7937592012693236e-07, "loss": 0.0828, "step": 2880 }, { "epoch": 0.34185001182872016, "grad_norm": 4.390218734741211, "learning_rate": 2.7925165916526047e-07, "loss": 0.0788, "step": 2890 }, { "epoch": 0.3430328838419683, "grad_norm": 4.639183044433594, "learning_rate": 2.791273982035886e-07, "loss": 0.0819, "step": 2900 }, { "epoch": 0.34421575585521647, "grad_norm": 2.9175424575805664, "learning_rate": 2.790031372419167e-07, "loss": 0.0931, "step": 2910 }, { "epoch": 0.34539862786846465, "grad_norm": 4.009083271026611, "learning_rate": 2.7887887628024473e-07, "loss": 0.0872, "step": 2920 }, { "epoch": 0.3465814998817128, "grad_norm": 3.8458118438720703, "learning_rate": 2.787546153185729e-07, "loss": 0.092, "step": 2930 }, { "epoch": 0.34776437189496096, "grad_norm": 2.746005058288574, "learning_rate": 2.78630354356901e-07, "loss": 0.0814, "step": 2940 }, { "epoch": 0.34894724390820914, "grad_norm": 3.092203378677368, "learning_rate": 2.7850609339522905e-07, "loss": 0.0863, "step": 2950 }, { "epoch": 0.3501301159214573, "grad_norm": 3.028773546218872, "learning_rate": 2.7838183243355716e-07, "loss": 0.0905, "step": 2960 }, { "epoch": 0.35131298793470545, "grad_norm": 3.5865437984466553, "learning_rate": 2.7825757147188527e-07, "loss": 0.0781, "step": 2970 }, { "epoch": 0.3524958599479536, "grad_norm": 3.8336100578308105, "learning_rate": 2.7813331051021337e-07, "loss": 0.0887, "step": 2980 }, { "epoch": 0.3536787319612018, "grad_norm": 4.474883556365967, "learning_rate": 2.780090495485415e-07, "loss": 0.0818, "step": 2990 }, { "epoch": 0.35486160397445, "grad_norm": 2.997204065322876, "learning_rate": 2.778847885868696e-07, "loss": 0.0833, "step": 3000 }, { "epoch": 0.3560444759876981, "grad_norm": 6.087620735168457, "learning_rate": 2.777605276251977e-07, "loss": 0.0866, "step": 3010 }, { "epoch": 0.3572273480009463, "grad_norm": 4.276342868804932, "learning_rate": 2.776362666635258e-07, "loss": 0.0758, "step": 3020 }, { "epoch": 0.3584102200141945, "grad_norm": 3.2080841064453125, "learning_rate": 2.775120057018539e-07, "loss": 0.0863, "step": 3030 }, { "epoch": 0.3595930920274426, "grad_norm": 6.177104949951172, "learning_rate": 2.7738774474018196e-07, "loss": 0.0772, "step": 3040 }, { "epoch": 0.3607759640406908, "grad_norm": 4.985621929168701, "learning_rate": 2.7726348377851006e-07, "loss": 0.0829, "step": 3050 }, { "epoch": 0.36195883605393897, "grad_norm": 4.600679397583008, "learning_rate": 2.7713922281683817e-07, "loss": 0.0988, "step": 3060 }, { "epoch": 0.36314170806718715, "grad_norm": 2.7192142009735107, "learning_rate": 2.770149618551663e-07, "loss": 0.0817, "step": 3070 }, { "epoch": 0.3643245800804353, "grad_norm": 3.5331640243530273, "learning_rate": 2.768907008934944e-07, "loss": 0.081, "step": 3080 }, { "epoch": 0.36550745209368346, "grad_norm": 4.6768107414245605, "learning_rate": 2.767664399318225e-07, "loss": 0.0833, "step": 3090 }, { "epoch": 0.36669032410693164, "grad_norm": 5.195780277252197, "learning_rate": 2.766421789701506e-07, "loss": 0.0845, "step": 3100 }, { "epoch": 0.3678731961201798, "grad_norm": 3.4562463760375977, "learning_rate": 2.765179180084787e-07, "loss": 0.0937, "step": 3110 }, { "epoch": 0.36905606813342795, "grad_norm": 4.305200576782227, "learning_rate": 2.7639365704680676e-07, "loss": 0.0916, "step": 3120 }, { "epoch": 0.37023894014667613, "grad_norm": 4.819878101348877, "learning_rate": 2.7626939608513486e-07, "loss": 0.0834, "step": 3130 }, { "epoch": 0.3714218121599243, "grad_norm": 3.3945224285125732, "learning_rate": 2.7614513512346297e-07, "loss": 0.0828, "step": 3140 }, { "epoch": 0.37260468417317244, "grad_norm": 2.873474597930908, "learning_rate": 2.760208741617911e-07, "loss": 0.072, "step": 3150 }, { "epoch": 0.3737875561864206, "grad_norm": 4.781952857971191, "learning_rate": 2.758966132001192e-07, "loss": 0.0923, "step": 3160 }, { "epoch": 0.3749704281996688, "grad_norm": 4.475467681884766, "learning_rate": 2.7577235223844723e-07, "loss": 0.0872, "step": 3170 }, { "epoch": 0.376153300212917, "grad_norm": 4.357288360595703, "learning_rate": 2.756480912767754e-07, "loss": 0.0841, "step": 3180 }, { "epoch": 0.3773361722261651, "grad_norm": 3.5623536109924316, "learning_rate": 2.755238303151035e-07, "loss": 0.0764, "step": 3190 }, { "epoch": 0.3785190442394133, "grad_norm": 4.803061008453369, "learning_rate": 2.753995693534316e-07, "loss": 0.082, "step": 3200 }, { "epoch": 0.3797019162526615, "grad_norm": 3.8007302284240723, "learning_rate": 2.7527530839175966e-07, "loss": 0.0942, "step": 3210 }, { "epoch": 0.3808847882659096, "grad_norm": 3.3903470039367676, "learning_rate": 2.7515104743008777e-07, "loss": 0.0817, "step": 3220 }, { "epoch": 0.3820676602791578, "grad_norm": 3.919443130493164, "learning_rate": 2.7502678646841587e-07, "loss": 0.0772, "step": 3230 }, { "epoch": 0.38325053229240597, "grad_norm": 2.981738567352295, "learning_rate": 2.74902525506744e-07, "loss": 0.0883, "step": 3240 }, { "epoch": 0.38443340430565415, "grad_norm": 3.401416301727295, "learning_rate": 2.747782645450721e-07, "loss": 0.085, "step": 3250 }, { "epoch": 0.3856162763189023, "grad_norm": 4.379512310028076, "learning_rate": 2.746540035834002e-07, "loss": 0.0924, "step": 3260 }, { "epoch": 0.38679914833215046, "grad_norm": 2.8705315589904785, "learning_rate": 2.745297426217283e-07, "loss": 0.0866, "step": 3270 }, { "epoch": 0.38798202034539864, "grad_norm": 4.05458927154541, "learning_rate": 2.744054816600564e-07, "loss": 0.0836, "step": 3280 }, { "epoch": 0.3891648923586468, "grad_norm": 5.1050543785095215, "learning_rate": 2.7428122069838446e-07, "loss": 0.0858, "step": 3290 }, { "epoch": 0.39034776437189495, "grad_norm": 3.804508924484253, "learning_rate": 2.7415695973671257e-07, "loss": 0.0839, "step": 3300 }, { "epoch": 0.3915306363851431, "grad_norm": 3.081632375717163, "learning_rate": 2.7403269877504067e-07, "loss": 0.0799, "step": 3310 }, { "epoch": 0.3927135083983913, "grad_norm": 3.6459410190582275, "learning_rate": 2.739084378133688e-07, "loss": 0.092, "step": 3320 }, { "epoch": 0.39389638041163944, "grad_norm": 4.038046836853027, "learning_rate": 2.737841768516969e-07, "loss": 0.0878, "step": 3330 }, { "epoch": 0.3950792524248876, "grad_norm": 4.058537483215332, "learning_rate": 2.73659915890025e-07, "loss": 0.0787, "step": 3340 }, { "epoch": 0.3962621244381358, "grad_norm": 6.913951396942139, "learning_rate": 2.735356549283531e-07, "loss": 0.0783, "step": 3350 }, { "epoch": 0.397444996451384, "grad_norm": 2.281195878982544, "learning_rate": 2.734113939666812e-07, "loss": 0.0841, "step": 3360 }, { "epoch": 0.3986278684646321, "grad_norm": 5.088544845581055, "learning_rate": 2.7328713300500926e-07, "loss": 0.0863, "step": 3370 }, { "epoch": 0.3998107404778803, "grad_norm": 3.2367053031921387, "learning_rate": 2.7316287204333736e-07, "loss": 0.0863, "step": 3380 }, { "epoch": 0.40099361249112847, "grad_norm": 2.788651943206787, "learning_rate": 2.7303861108166547e-07, "loss": 0.079, "step": 3390 }, { "epoch": 0.40217648450437665, "grad_norm": 3.1775708198547363, "learning_rate": 2.729143501199936e-07, "loss": 0.0795, "step": 3400 }, { "epoch": 0.4033593565176248, "grad_norm": 1.9840484857559204, "learning_rate": 2.727900891583217e-07, "loss": 0.0907, "step": 3410 }, { "epoch": 0.40454222853087296, "grad_norm": 3.123171091079712, "learning_rate": 2.726658281966498e-07, "loss": 0.0838, "step": 3420 }, { "epoch": 0.40572510054412114, "grad_norm": 3.604736328125, "learning_rate": 2.725415672349779e-07, "loss": 0.0772, "step": 3430 }, { "epoch": 0.40690797255736927, "grad_norm": 3.479311466217041, "learning_rate": 2.72417306273306e-07, "loss": 0.0776, "step": 3440 }, { "epoch": 0.40809084457061745, "grad_norm": 3.187650442123413, "learning_rate": 2.722930453116341e-07, "loss": 0.0798, "step": 3450 }, { "epoch": 0.40927371658386563, "grad_norm": 2.5308213233947754, "learning_rate": 2.7216878434996216e-07, "loss": 0.0846, "step": 3460 }, { "epoch": 0.4104565885971138, "grad_norm": 5.219987869262695, "learning_rate": 2.7204452338829027e-07, "loss": 0.085, "step": 3470 }, { "epoch": 0.41163946061036194, "grad_norm": 5.175335884094238, "learning_rate": 2.719202624266184e-07, "loss": 0.0797, "step": 3480 }, { "epoch": 0.4128223326236101, "grad_norm": 3.493697166442871, "learning_rate": 2.717960014649465e-07, "loss": 0.0818, "step": 3490 }, { "epoch": 0.4140052046368583, "grad_norm": 3.486443519592285, "learning_rate": 2.716717405032746e-07, "loss": 0.076, "step": 3500 }, { "epoch": 0.4151880766501065, "grad_norm": 3.1171669960021973, "learning_rate": 2.715474795416027e-07, "loss": 0.0806, "step": 3510 }, { "epoch": 0.4163709486633546, "grad_norm": 3.8365890979766846, "learning_rate": 2.714232185799308e-07, "loss": 0.0837, "step": 3520 }, { "epoch": 0.4175538206766028, "grad_norm": 3.3638434410095215, "learning_rate": 2.712989576182589e-07, "loss": 0.0814, "step": 3530 }, { "epoch": 0.418736692689851, "grad_norm": 4.005792617797852, "learning_rate": 2.7117469665658696e-07, "loss": 0.079, "step": 3540 }, { "epoch": 0.4199195647030991, "grad_norm": 2.8239734172821045, "learning_rate": 2.7105043569491507e-07, "loss": 0.0876, "step": 3550 }, { "epoch": 0.4211024367163473, "grad_norm": 4.09285306930542, "learning_rate": 2.7092617473324317e-07, "loss": 0.0931, "step": 3560 }, { "epoch": 0.42228530872959547, "grad_norm": 3.0454795360565186, "learning_rate": 2.708019137715713e-07, "loss": 0.0925, "step": 3570 }, { "epoch": 0.42346818074284365, "grad_norm": 2.356950044631958, "learning_rate": 2.706776528098994e-07, "loss": 0.0763, "step": 3580 }, { "epoch": 0.4246510527560918, "grad_norm": 3.626080274581909, "learning_rate": 2.705533918482275e-07, "loss": 0.084, "step": 3590 }, { "epoch": 0.42583392476933996, "grad_norm": 2.284700632095337, "learning_rate": 2.704291308865556e-07, "loss": 0.0836, "step": 3600 }, { "epoch": 0.42701679678258814, "grad_norm": 3.8480172157287598, "learning_rate": 2.703048699248837e-07, "loss": 0.0941, "step": 3610 }, { "epoch": 0.42819966879583626, "grad_norm": 5.074581623077393, "learning_rate": 2.701806089632118e-07, "loss": 0.0868, "step": 3620 }, { "epoch": 0.42938254080908445, "grad_norm": 3.2415335178375244, "learning_rate": 2.7005634800153986e-07, "loss": 0.0788, "step": 3630 }, { "epoch": 0.4305654128223326, "grad_norm": 4.686880111694336, "learning_rate": 2.6993208703986797e-07, "loss": 0.0817, "step": 3640 }, { "epoch": 0.4317482848355808, "grad_norm": 3.968097686767578, "learning_rate": 2.698078260781961e-07, "loss": 0.0753, "step": 3650 }, { "epoch": 0.43293115684882894, "grad_norm": 4.052289962768555, "learning_rate": 2.696835651165242e-07, "loss": 0.086, "step": 3660 }, { "epoch": 0.4341140288620771, "grad_norm": 4.214913368225098, "learning_rate": 2.695593041548523e-07, "loss": 0.0881, "step": 3670 }, { "epoch": 0.4352969008753253, "grad_norm": 3.5761194229125977, "learning_rate": 2.694350431931804e-07, "loss": 0.0865, "step": 3680 }, { "epoch": 0.4364797728885735, "grad_norm": 4.45274543762207, "learning_rate": 2.693107822315085e-07, "loss": 0.083, "step": 3690 }, { "epoch": 0.4376626449018216, "grad_norm": 4.2090983390808105, "learning_rate": 2.691865212698366e-07, "loss": 0.0805, "step": 3700 }, { "epoch": 0.4388455169150698, "grad_norm": 2.9131274223327637, "learning_rate": 2.6906226030816466e-07, "loss": 0.0886, "step": 3710 }, { "epoch": 0.44002838892831797, "grad_norm": 3.30649995803833, "learning_rate": 2.6893799934649277e-07, "loss": 0.0895, "step": 3720 }, { "epoch": 0.4412112609415661, "grad_norm": 3.2177176475524902, "learning_rate": 2.688137383848209e-07, "loss": 0.0836, "step": 3730 }, { "epoch": 0.4423941329548143, "grad_norm": 3.677591562271118, "learning_rate": 2.68689477423149e-07, "loss": 0.0828, "step": 3740 }, { "epoch": 0.44357700496806246, "grad_norm": 2.8260738849639893, "learning_rate": 2.685652164614771e-07, "loss": 0.0817, "step": 3750 }, { "epoch": 0.44475987698131064, "grad_norm": 3.406869888305664, "learning_rate": 2.684409554998052e-07, "loss": 0.0766, "step": 3760 }, { "epoch": 0.44594274899455877, "grad_norm": 3.0069966316223145, "learning_rate": 2.683166945381333e-07, "loss": 0.0843, "step": 3770 }, { "epoch": 0.44712562100780695, "grad_norm": 3.14689302444458, "learning_rate": 2.681924335764614e-07, "loss": 0.0768, "step": 3780 }, { "epoch": 0.44830849302105513, "grad_norm": 3.049311876296997, "learning_rate": 2.6806817261478946e-07, "loss": 0.0796, "step": 3790 }, { "epoch": 0.4494913650343033, "grad_norm": 2.945707082748413, "learning_rate": 2.6794391165311757e-07, "loss": 0.0837, "step": 3800 }, { "epoch": 0.45067423704755144, "grad_norm": 2.58554744720459, "learning_rate": 2.6781965069144567e-07, "loss": 0.0951, "step": 3810 }, { "epoch": 0.4518571090607996, "grad_norm": 3.4531877040863037, "learning_rate": 2.676953897297738e-07, "loss": 0.0869, "step": 3820 }, { "epoch": 0.4530399810740478, "grad_norm": 2.6947646141052246, "learning_rate": 2.675711287681019e-07, "loss": 0.0748, "step": 3830 }, { "epoch": 0.45422285308729593, "grad_norm": 5.029610633850098, "learning_rate": 2.6744686780643e-07, "loss": 0.0826, "step": 3840 }, { "epoch": 0.4554057251005441, "grad_norm": 4.333353042602539, "learning_rate": 2.673226068447581e-07, "loss": 0.0748, "step": 3850 }, { "epoch": 0.4565885971137923, "grad_norm": 2.7484779357910156, "learning_rate": 2.671983458830862e-07, "loss": 0.094, "step": 3860 }, { "epoch": 0.4577714691270405, "grad_norm": 3.1886627674102783, "learning_rate": 2.670740849214143e-07, "loss": 0.0838, "step": 3870 }, { "epoch": 0.4589543411402886, "grad_norm": 5.524952411651611, "learning_rate": 2.6694982395974237e-07, "loss": 0.0865, "step": 3880 }, { "epoch": 0.4601372131535368, "grad_norm": 3.638477325439453, "learning_rate": 2.6682556299807047e-07, "loss": 0.0828, "step": 3890 }, { "epoch": 0.46132008516678497, "grad_norm": 2.2801597118377686, "learning_rate": 2.667013020363986e-07, "loss": 0.0775, "step": 3900 }, { "epoch": 0.46250295718003315, "grad_norm": 2.8219950199127197, "learning_rate": 2.665770410747267e-07, "loss": 0.0927, "step": 3910 }, { "epoch": 0.4636858291932813, "grad_norm": 3.390726089477539, "learning_rate": 2.664527801130548e-07, "loss": 0.0852, "step": 3920 }, { "epoch": 0.46486870120652946, "grad_norm": 2.7872021198272705, "learning_rate": 2.663285191513829e-07, "loss": 0.0775, "step": 3930 }, { "epoch": 0.46605157321977764, "grad_norm": 3.500958204269409, "learning_rate": 2.66204258189711e-07, "loss": 0.0837, "step": 3940 }, { "epoch": 0.46723444523302576, "grad_norm": 4.924251556396484, "learning_rate": 2.660799972280391e-07, "loss": 0.0751, "step": 3950 }, { "epoch": 0.46841731724627395, "grad_norm": 4.182793617248535, "learning_rate": 2.6595573626636716e-07, "loss": 0.0818, "step": 3960 }, { "epoch": 0.4696001892595221, "grad_norm": 4.644354343414307, "learning_rate": 2.6583147530469527e-07, "loss": 0.086, "step": 3970 }, { "epoch": 0.4707830612727703, "grad_norm": 3.6151394844055176, "learning_rate": 2.657072143430234e-07, "loss": 0.0849, "step": 3980 }, { "epoch": 0.47196593328601844, "grad_norm": 3.8727123737335205, "learning_rate": 2.655829533813515e-07, "loss": 0.0787, "step": 3990 }, { "epoch": 0.4731488052992666, "grad_norm": 3.742002010345459, "learning_rate": 2.654586924196796e-07, "loss": 0.0823, "step": 4000 }, { "epoch": 0.4743316773125148, "grad_norm": 2.547376871109009, "learning_rate": 2.653344314580077e-07, "loss": 0.0868, "step": 4010 }, { "epoch": 0.4755145493257629, "grad_norm": 2.687694787979126, "learning_rate": 2.652101704963358e-07, "loss": 0.0841, "step": 4020 }, { "epoch": 0.4766974213390111, "grad_norm": 4.799156665802002, "learning_rate": 2.650859095346639e-07, "loss": 0.0835, "step": 4030 }, { "epoch": 0.4778802933522593, "grad_norm": 3.3583476543426514, "learning_rate": 2.64961648572992e-07, "loss": 0.0788, "step": 4040 }, { "epoch": 0.47906316536550747, "grad_norm": 4.629757881164551, "learning_rate": 2.6483738761132007e-07, "loss": 0.0877, "step": 4050 }, { "epoch": 0.4802460373787556, "grad_norm": 3.169760227203369, "learning_rate": 2.647131266496482e-07, "loss": 0.0867, "step": 4060 }, { "epoch": 0.4814289093920038, "grad_norm": 3.9444119930267334, "learning_rate": 2.645888656879763e-07, "loss": 0.0853, "step": 4070 }, { "epoch": 0.48261178140525196, "grad_norm": 3.2847728729248047, "learning_rate": 2.644646047263044e-07, "loss": 0.0793, "step": 4080 }, { "epoch": 0.48379465341850014, "grad_norm": 3.440415620803833, "learning_rate": 2.643403437646325e-07, "loss": 0.0745, "step": 4090 }, { "epoch": 0.48497752543174827, "grad_norm": 4.316025257110596, "learning_rate": 2.642160828029606e-07, "loss": 0.0789, "step": 4100 }, { "epoch": 0.48616039744499645, "grad_norm": 3.3015058040618896, "learning_rate": 2.640918218412887e-07, "loss": 0.0879, "step": 4110 }, { "epoch": 0.48734326945824463, "grad_norm": 3.113067388534546, "learning_rate": 2.639675608796168e-07, "loss": 0.0909, "step": 4120 }, { "epoch": 0.48852614147149276, "grad_norm": 3.4626364707946777, "learning_rate": 2.6384329991794487e-07, "loss": 0.0849, "step": 4130 }, { "epoch": 0.48970901348474094, "grad_norm": 2.7530677318573, "learning_rate": 2.6371903895627297e-07, "loss": 0.0795, "step": 4140 }, { "epoch": 0.4908918854979891, "grad_norm": 3.2920970916748047, "learning_rate": 2.635947779946011e-07, "loss": 0.0844, "step": 4150 }, { "epoch": 0.4920747575112373, "grad_norm": 2.975703477859497, "learning_rate": 2.634705170329292e-07, "loss": 0.0849, "step": 4160 }, { "epoch": 0.49325762952448543, "grad_norm": 2.33211350440979, "learning_rate": 2.633462560712573e-07, "loss": 0.071, "step": 4170 }, { "epoch": 0.4944405015377336, "grad_norm": 4.29820442199707, "learning_rate": 2.632219951095854e-07, "loss": 0.084, "step": 4180 }, { "epoch": 0.4956233735509818, "grad_norm": 4.372604846954346, "learning_rate": 2.630977341479135e-07, "loss": 0.0824, "step": 4190 }, { "epoch": 0.49680624556423, "grad_norm": 2.9048173427581787, "learning_rate": 2.629734731862416e-07, "loss": 0.072, "step": 4200 }, { "epoch": 0.4979891175774781, "grad_norm": 4.972493648529053, "learning_rate": 2.6284921222456966e-07, "loss": 0.0823, "step": 4210 }, { "epoch": 0.4991719895907263, "grad_norm": 3.2681450843811035, "learning_rate": 2.6272495126289777e-07, "loss": 0.0802, "step": 4220 }, { "epoch": 0.5003548616039745, "grad_norm": 3.4494874477386475, "learning_rate": 2.626006903012259e-07, "loss": 0.0841, "step": 4230 }, { "epoch": 0.5015377336172226, "grad_norm": 3.1300835609436035, "learning_rate": 2.62476429339554e-07, "loss": 0.0792, "step": 4240 }, { "epoch": 0.5027206056304708, "grad_norm": 3.2256789207458496, "learning_rate": 2.623521683778821e-07, "loss": 0.0801, "step": 4250 }, { "epoch": 0.5039034776437189, "grad_norm": 4.700382709503174, "learning_rate": 2.622279074162102e-07, "loss": 0.0813, "step": 4260 }, { "epoch": 0.5050863496569671, "grad_norm": 4.027833461761475, "learning_rate": 2.621036464545383e-07, "loss": 0.0902, "step": 4270 }, { "epoch": 0.5062692216702153, "grad_norm": 3.9444737434387207, "learning_rate": 2.619793854928664e-07, "loss": 0.087, "step": 4280 }, { "epoch": 0.5074520936834634, "grad_norm": 3.043041944503784, "learning_rate": 2.618551245311945e-07, "loss": 0.0764, "step": 4290 }, { "epoch": 0.5086349656967116, "grad_norm": 3.8577401638031006, "learning_rate": 2.6173086356952257e-07, "loss": 0.0847, "step": 4300 }, { "epoch": 0.5098178377099598, "grad_norm": 2.511228322982788, "learning_rate": 2.616066026078507e-07, "loss": 0.0858, "step": 4310 }, { "epoch": 0.511000709723208, "grad_norm": 2.5092222690582275, "learning_rate": 2.6148234164617883e-07, "loss": 0.0876, "step": 4320 }, { "epoch": 0.5121835817364561, "grad_norm": 3.42560076713562, "learning_rate": 2.613580806845069e-07, "loss": 0.0813, "step": 4330 }, { "epoch": 0.5133664537497042, "grad_norm": 3.111553907394409, "learning_rate": 2.61233819722835e-07, "loss": 0.0798, "step": 4340 }, { "epoch": 0.5145493257629524, "grad_norm": 3.3020918369293213, "learning_rate": 2.611095587611631e-07, "loss": 0.0809, "step": 4350 }, { "epoch": 0.5157321977762006, "grad_norm": 3.632606267929077, "learning_rate": 2.609852977994912e-07, "loss": 0.084, "step": 4360 }, { "epoch": 0.5169150697894488, "grad_norm": 4.275643825531006, "learning_rate": 2.608610368378193e-07, "loss": 0.0811, "step": 4370 }, { "epoch": 0.518097941802697, "grad_norm": 2.417649507522583, "learning_rate": 2.6073677587614737e-07, "loss": 0.0821, "step": 4380 }, { "epoch": 0.5192808138159452, "grad_norm": 4.206284523010254, "learning_rate": 2.606125149144755e-07, "loss": 0.0818, "step": 4390 }, { "epoch": 0.5204636858291933, "grad_norm": 3.3791139125823975, "learning_rate": 2.604882539528036e-07, "loss": 0.0788, "step": 4400 }, { "epoch": 0.5216465578424414, "grad_norm": 2.728119373321533, "learning_rate": 2.603639929911317e-07, "loss": 0.0805, "step": 4410 }, { "epoch": 0.5228294298556896, "grad_norm": 3.5953633785247803, "learning_rate": 2.602397320294598e-07, "loss": 0.0819, "step": 4420 }, { "epoch": 0.5240123018689378, "grad_norm": 3.4545135498046875, "learning_rate": 2.601154710677879e-07, "loss": 0.0798, "step": 4430 }, { "epoch": 0.525195173882186, "grad_norm": 4.237427234649658, "learning_rate": 2.59991210106116e-07, "loss": 0.0882, "step": 4440 }, { "epoch": 0.5263780458954341, "grad_norm": 4.069971084594727, "learning_rate": 2.598669491444441e-07, "loss": 0.0813, "step": 4450 }, { "epoch": 0.5275609179086823, "grad_norm": 3.7415192127227783, "learning_rate": 2.597426881827722e-07, "loss": 0.0811, "step": 4460 }, { "epoch": 0.5287437899219305, "grad_norm": 2.989156723022461, "learning_rate": 2.5961842722110027e-07, "loss": 0.0866, "step": 4470 }, { "epoch": 0.5299266619351786, "grad_norm": 2.2155065536499023, "learning_rate": 2.594941662594284e-07, "loss": 0.0734, "step": 4480 }, { "epoch": 0.5311095339484267, "grad_norm": 4.160024642944336, "learning_rate": 2.5936990529775654e-07, "loss": 0.081, "step": 4490 }, { "epoch": 0.5322924059616749, "grad_norm": 4.046550273895264, "learning_rate": 2.592456443360846e-07, "loss": 0.08, "step": 4500 }, { "epoch": 0.5334752779749231, "grad_norm": 2.5924854278564453, "learning_rate": 2.591213833744127e-07, "loss": 0.0843, "step": 4510 }, { "epoch": 0.5346581499881713, "grad_norm": 3.4948198795318604, "learning_rate": 2.589971224127408e-07, "loss": 0.0913, "step": 4520 }, { "epoch": 0.5358410220014195, "grad_norm": 3.6743836402893066, "learning_rate": 2.588728614510689e-07, "loss": 0.0784, "step": 4530 }, { "epoch": 0.5370238940146677, "grad_norm": 3.533297061920166, "learning_rate": 2.58748600489397e-07, "loss": 0.0777, "step": 4540 }, { "epoch": 0.5382067660279157, "grad_norm": 2.7210309505462646, "learning_rate": 2.5862433952772507e-07, "loss": 0.0773, "step": 4550 }, { "epoch": 0.5393896380411639, "grad_norm": 3.5995471477508545, "learning_rate": 2.585000785660532e-07, "loss": 0.0927, "step": 4560 }, { "epoch": 0.5405725100544121, "grad_norm": 2.4642326831817627, "learning_rate": 2.5837581760438134e-07, "loss": 0.0831, "step": 4570 }, { "epoch": 0.5417553820676603, "grad_norm": 2.7347187995910645, "learning_rate": 2.582515566427094e-07, "loss": 0.0798, "step": 4580 }, { "epoch": 0.5429382540809085, "grad_norm": 4.410131931304932, "learning_rate": 2.581272956810375e-07, "loss": 0.0814, "step": 4590 }, { "epoch": 0.5441211260941566, "grad_norm": 3.530869483947754, "learning_rate": 2.580030347193656e-07, "loss": 0.0764, "step": 4600 }, { "epoch": 0.5453039981074048, "grad_norm": 2.71795916557312, "learning_rate": 2.578787737576937e-07, "loss": 0.0908, "step": 4610 }, { "epoch": 0.5464868701206529, "grad_norm": 4.219018936157227, "learning_rate": 2.577545127960218e-07, "loss": 0.0863, "step": 4620 }, { "epoch": 0.5476697421339011, "grad_norm": 2.4465043544769287, "learning_rate": 2.5763025183434987e-07, "loss": 0.077, "step": 4630 }, { "epoch": 0.5488526141471493, "grad_norm": 3.7928225994110107, "learning_rate": 2.57505990872678e-07, "loss": 0.08, "step": 4640 }, { "epoch": 0.5500354861603974, "grad_norm": 2.131216526031494, "learning_rate": 2.5738172991100613e-07, "loss": 0.086, "step": 4650 }, { "epoch": 0.5512183581736456, "grad_norm": 2.0925304889678955, "learning_rate": 2.572574689493342e-07, "loss": 0.0875, "step": 4660 }, { "epoch": 0.5524012301868938, "grad_norm": 2.857017755508423, "learning_rate": 2.571332079876623e-07, "loss": 0.0808, "step": 4670 }, { "epoch": 0.553584102200142, "grad_norm": 3.693185329437256, "learning_rate": 2.570089470259904e-07, "loss": 0.0797, "step": 4680 }, { "epoch": 0.5547669742133902, "grad_norm": 2.9493167400360107, "learning_rate": 2.568846860643185e-07, "loss": 0.082, "step": 4690 }, { "epoch": 0.5559498462266382, "grad_norm": 3.8919827938079834, "learning_rate": 2.567604251026466e-07, "loss": 0.082, "step": 4700 }, { "epoch": 0.5571327182398864, "grad_norm": 5.387547016143799, "learning_rate": 2.566361641409747e-07, "loss": 0.0839, "step": 4710 }, { "epoch": 0.5583155902531346, "grad_norm": 3.417860507965088, "learning_rate": 2.5651190317930277e-07, "loss": 0.0884, "step": 4720 }, { "epoch": 0.5594984622663828, "grad_norm": 2.7079994678497314, "learning_rate": 2.563876422176309e-07, "loss": 0.0814, "step": 4730 }, { "epoch": 0.560681334279631, "grad_norm": 2.8376657962799072, "learning_rate": 2.5626338125595904e-07, "loss": 0.0889, "step": 4740 }, { "epoch": 0.5618642062928791, "grad_norm": 2.505284547805786, "learning_rate": 2.561391202942871e-07, "loss": 0.0752, "step": 4750 }, { "epoch": 0.5630470783061273, "grad_norm": 3.0635619163513184, "learning_rate": 2.560148593326152e-07, "loss": 0.0863, "step": 4760 }, { "epoch": 0.5642299503193754, "grad_norm": 3.097635507583618, "learning_rate": 2.558905983709433e-07, "loss": 0.0822, "step": 4770 }, { "epoch": 0.5654128223326236, "grad_norm": 2.6020431518554688, "learning_rate": 2.557663374092714e-07, "loss": 0.0815, "step": 4780 }, { "epoch": 0.5665956943458718, "grad_norm": 2.973048210144043, "learning_rate": 2.556420764475995e-07, "loss": 0.0805, "step": 4790 }, { "epoch": 0.5677785663591199, "grad_norm": 2.0770013332366943, "learning_rate": 2.5551781548592757e-07, "loss": 0.0727, "step": 4800 }, { "epoch": 0.5689614383723681, "grad_norm": 2.964733600616455, "learning_rate": 2.553935545242557e-07, "loss": 0.0863, "step": 4810 }, { "epoch": 0.5701443103856163, "grad_norm": 2.7105531692504883, "learning_rate": 2.5526929356258384e-07, "loss": 0.0773, "step": 4820 }, { "epoch": 0.5713271823988645, "grad_norm": 3.38931941986084, "learning_rate": 2.551450326009119e-07, "loss": 0.0798, "step": 4830 }, { "epoch": 0.5725100544121126, "grad_norm": 2.3484251499176025, "learning_rate": 2.5502077163924e-07, "loss": 0.0723, "step": 4840 }, { "epoch": 0.5736929264253607, "grad_norm": 2.74106764793396, "learning_rate": 2.548965106775681e-07, "loss": 0.0728, "step": 4850 }, { "epoch": 0.5748757984386089, "grad_norm": 3.726855516433716, "learning_rate": 2.547722497158962e-07, "loss": 0.0932, "step": 4860 }, { "epoch": 0.5760586704518571, "grad_norm": 3.3901708126068115, "learning_rate": 2.546479887542243e-07, "loss": 0.0801, "step": 4870 }, { "epoch": 0.5772415424651053, "grad_norm": 3.835559844970703, "learning_rate": 2.545237277925524e-07, "loss": 0.0786, "step": 4880 }, { "epoch": 0.5784244144783535, "grad_norm": 2.2467880249023438, "learning_rate": 2.543994668308805e-07, "loss": 0.0747, "step": 4890 }, { "epoch": 0.5796072864916016, "grad_norm": 3.044595241546631, "learning_rate": 2.5427520586920864e-07, "loss": 0.0754, "step": 4900 }, { "epoch": 0.5807901585048498, "grad_norm": 2.1652138233184814, "learning_rate": 2.5415094490753674e-07, "loss": 0.0872, "step": 4910 }, { "epoch": 0.5819730305180979, "grad_norm": 3.320662498474121, "learning_rate": 2.540266839458648e-07, "loss": 0.0848, "step": 4920 }, { "epoch": 0.5831559025313461, "grad_norm": 2.9915237426757812, "learning_rate": 2.539024229841929e-07, "loss": 0.0829, "step": 4930 }, { "epoch": 0.5843387745445943, "grad_norm": 3.946162223815918, "learning_rate": 2.53778162022521e-07, "loss": 0.0842, "step": 4940 }, { "epoch": 0.5855216465578424, "grad_norm": 2.3292176723480225, "learning_rate": 2.536539010608491e-07, "loss": 0.0762, "step": 4950 }, { "epoch": 0.5867045185710906, "grad_norm": 3.8679192066192627, "learning_rate": 2.535296400991772e-07, "loss": 0.0838, "step": 4960 }, { "epoch": 0.5878873905843388, "grad_norm": 3.16050124168396, "learning_rate": 2.534053791375053e-07, "loss": 0.0871, "step": 4970 }, { "epoch": 0.589070262597587, "grad_norm": 3.7792184352874756, "learning_rate": 2.5328111817583343e-07, "loss": 0.0793, "step": 4980 }, { "epoch": 0.5902531346108351, "grad_norm": 3.2933349609375, "learning_rate": 2.5315685721416154e-07, "loss": 0.0828, "step": 4990 }, { "epoch": 0.5914360066240832, "grad_norm": 3.5960817337036133, "learning_rate": 2.530325962524896e-07, "loss": 0.0767, "step": 5000 }, { "epoch": 0.5926188786373314, "grad_norm": 2.6037306785583496, "learning_rate": 2.529083352908177e-07, "loss": 0.0849, "step": 5010 }, { "epoch": 0.5938017506505796, "grad_norm": 4.315911769866943, "learning_rate": 2.527840743291458e-07, "loss": 0.077, "step": 5020 }, { "epoch": 0.5949846226638278, "grad_norm": 4.2913923263549805, "learning_rate": 2.526598133674739e-07, "loss": 0.0817, "step": 5030 }, { "epoch": 0.596167494677076, "grad_norm": 3.3926076889038086, "learning_rate": 2.52535552405802e-07, "loss": 0.0701, "step": 5040 }, { "epoch": 0.5973503666903242, "grad_norm": 3.2948966026306152, "learning_rate": 2.5241129144413007e-07, "loss": 0.0726, "step": 5050 }, { "epoch": 0.5985332387035722, "grad_norm": 2.258929491043091, "learning_rate": 2.5228703048245823e-07, "loss": 0.0832, "step": 5060 }, { "epoch": 0.5997161107168204, "grad_norm": 2.304443836212158, "learning_rate": 2.5216276952078634e-07, "loss": 0.0906, "step": 5070 }, { "epoch": 0.6008989827300686, "grad_norm": 3.56166934967041, "learning_rate": 2.520385085591144e-07, "loss": 0.0805, "step": 5080 }, { "epoch": 0.6020818547433168, "grad_norm": 2.402224540710449, "learning_rate": 2.519142475974425e-07, "loss": 0.0809, "step": 5090 }, { "epoch": 0.603264726756565, "grad_norm": 4.592747688293457, "learning_rate": 2.517899866357706e-07, "loss": 0.0867, "step": 5100 }, { "epoch": 0.6044475987698131, "grad_norm": 3.803203821182251, "learning_rate": 2.516657256740987e-07, "loss": 0.0817, "step": 5110 }, { "epoch": 0.6056304707830613, "grad_norm": 3.0419344902038574, "learning_rate": 2.515414647124268e-07, "loss": 0.0908, "step": 5120 }, { "epoch": 0.6068133427963094, "grad_norm": 2.681278705596924, "learning_rate": 2.514172037507549e-07, "loss": 0.0783, "step": 5130 }, { "epoch": 0.6079962148095576, "grad_norm": 3.1544816493988037, "learning_rate": 2.51292942789083e-07, "loss": 0.0773, "step": 5140 }, { "epoch": 0.6091790868228057, "grad_norm": 3.6531503200531006, "learning_rate": 2.5116868182741114e-07, "loss": 0.0877, "step": 5150 }, { "epoch": 0.6103619588360539, "grad_norm": 3.3948307037353516, "learning_rate": 2.5104442086573924e-07, "loss": 0.0862, "step": 5160 }, { "epoch": 0.6115448308493021, "grad_norm": 2.950212001800537, "learning_rate": 2.509201599040673e-07, "loss": 0.0791, "step": 5170 }, { "epoch": 0.6127277028625503, "grad_norm": 4.5383405685424805, "learning_rate": 2.507958989423954e-07, "loss": 0.0836, "step": 5180 }, { "epoch": 0.6139105748757985, "grad_norm": 4.1940789222717285, "learning_rate": 2.506716379807235e-07, "loss": 0.074, "step": 5190 }, { "epoch": 0.6150934468890467, "grad_norm": 2.8239166736602783, "learning_rate": 2.505473770190516e-07, "loss": 0.0794, "step": 5200 }, { "epoch": 0.6162763189022947, "grad_norm": 2.7730705738067627, "learning_rate": 2.504231160573797e-07, "loss": 0.0862, "step": 5210 }, { "epoch": 0.6174591909155429, "grad_norm": 2.8319149017333984, "learning_rate": 2.502988550957078e-07, "loss": 0.0806, "step": 5220 }, { "epoch": 0.6186420629287911, "grad_norm": 4.9800310134887695, "learning_rate": 2.5017459413403593e-07, "loss": 0.0802, "step": 5230 }, { "epoch": 0.6198249349420393, "grad_norm": 4.146650791168213, "learning_rate": 2.5005033317236404e-07, "loss": 0.0802, "step": 5240 }, { "epoch": 0.6210078069552875, "grad_norm": 2.177483558654785, "learning_rate": 2.499260722106921e-07, "loss": 0.0819, "step": 5250 }, { "epoch": 0.6221906789685356, "grad_norm": 2.3585174083709717, "learning_rate": 2.498018112490202e-07, "loss": 0.0813, "step": 5260 }, { "epoch": 0.6233735509817838, "grad_norm": 2.0580697059631348, "learning_rate": 2.496775502873483e-07, "loss": 0.0732, "step": 5270 }, { "epoch": 0.6245564229950319, "grad_norm": 2.812100887298584, "learning_rate": 2.495532893256764e-07, "loss": 0.0909, "step": 5280 }, { "epoch": 0.6257392950082801, "grad_norm": 3.3786368370056152, "learning_rate": 2.494290283640045e-07, "loss": 0.0793, "step": 5290 }, { "epoch": 0.6269221670215283, "grad_norm": 3.011425256729126, "learning_rate": 2.493047674023326e-07, "loss": 0.0847, "step": 5300 }, { "epoch": 0.6281050390347764, "grad_norm": 3.313220262527466, "learning_rate": 2.4918050644066073e-07, "loss": 0.0872, "step": 5310 }, { "epoch": 0.6292879110480246, "grad_norm": 3.082502603530884, "learning_rate": 2.4905624547898884e-07, "loss": 0.0763, "step": 5320 }, { "epoch": 0.6304707830612728, "grad_norm": 2.578134059906006, "learning_rate": 2.4893198451731695e-07, "loss": 0.0782, "step": 5330 }, { "epoch": 0.631653655074521, "grad_norm": 2.828599691390991, "learning_rate": 2.48807723555645e-07, "loss": 0.0777, "step": 5340 }, { "epoch": 0.632836527087769, "grad_norm": 4.773819446563721, "learning_rate": 2.486834625939731e-07, "loss": 0.0786, "step": 5350 }, { "epoch": 0.6340193991010172, "grad_norm": 2.863616943359375, "learning_rate": 2.485592016323012e-07, "loss": 0.0955, "step": 5360 }, { "epoch": 0.6352022711142654, "grad_norm": 2.579099416732788, "learning_rate": 2.484349406706293e-07, "loss": 0.0814, "step": 5370 }, { "epoch": 0.6363851431275136, "grad_norm": 2.2746381759643555, "learning_rate": 2.483106797089574e-07, "loss": 0.0821, "step": 5380 }, { "epoch": 0.6375680151407618, "grad_norm": 3.0017411708831787, "learning_rate": 2.4818641874728553e-07, "loss": 0.0856, "step": 5390 }, { "epoch": 0.63875088715401, "grad_norm": 2.481255054473877, "learning_rate": 2.4806215778561364e-07, "loss": 0.0803, "step": 5400 }, { "epoch": 0.6399337591672581, "grad_norm": 3.7967991828918457, "learning_rate": 2.4793789682394174e-07, "loss": 0.0869, "step": 5410 }, { "epoch": 0.6411166311805063, "grad_norm": 3.630337715148926, "learning_rate": 2.478136358622698e-07, "loss": 0.0879, "step": 5420 }, { "epoch": 0.6422995031937544, "grad_norm": 2.8189852237701416, "learning_rate": 2.476893749005979e-07, "loss": 0.0829, "step": 5430 }, { "epoch": 0.6434823752070026, "grad_norm": 4.036816120147705, "learning_rate": 2.47565113938926e-07, "loss": 0.0756, "step": 5440 }, { "epoch": 0.6446652472202508, "grad_norm": 2.823315143585205, "learning_rate": 2.474408529772541e-07, "loss": 0.0763, "step": 5450 }, { "epoch": 0.6458481192334989, "grad_norm": 1.7750979661941528, "learning_rate": 2.473165920155822e-07, "loss": 0.0892, "step": 5460 }, { "epoch": 0.6470309912467471, "grad_norm": 2.780932903289795, "learning_rate": 2.4719233105391033e-07, "loss": 0.0865, "step": 5470 }, { "epoch": 0.6482138632599953, "grad_norm": 2.381618022918701, "learning_rate": 2.4706807009223844e-07, "loss": 0.076, "step": 5480 }, { "epoch": 0.6493967352732435, "grad_norm": 4.224671363830566, "learning_rate": 2.4694380913056654e-07, "loss": 0.082, "step": 5490 }, { "epoch": 0.6505796072864916, "grad_norm": 2.859919786453247, "learning_rate": 2.468195481688946e-07, "loss": 0.0771, "step": 5500 }, { "epoch": 0.6517624792997397, "grad_norm": 2.8784821033477783, "learning_rate": 2.466952872072227e-07, "loss": 0.0857, "step": 5510 }, { "epoch": 0.6529453513129879, "grad_norm": 2.42313551902771, "learning_rate": 2.465710262455508e-07, "loss": 0.0781, "step": 5520 }, { "epoch": 0.6541282233262361, "grad_norm": 2.7595460414886475, "learning_rate": 2.464467652838789e-07, "loss": 0.0762, "step": 5530 }, { "epoch": 0.6553110953394843, "grad_norm": 3.1140928268432617, "learning_rate": 2.46322504322207e-07, "loss": 0.0742, "step": 5540 }, { "epoch": 0.6564939673527325, "grad_norm": 4.461252689361572, "learning_rate": 2.4619824336053513e-07, "loss": 0.0838, "step": 5550 }, { "epoch": 0.6576768393659806, "grad_norm": 4.465931415557861, "learning_rate": 2.4607398239886323e-07, "loss": 0.0889, "step": 5560 }, { "epoch": 0.6588597113792287, "grad_norm": 3.2919321060180664, "learning_rate": 2.4594972143719134e-07, "loss": 0.0806, "step": 5570 }, { "epoch": 0.6600425833924769, "grad_norm": 2.6106438636779785, "learning_rate": 2.4582546047551945e-07, "loss": 0.0836, "step": 5580 }, { "epoch": 0.6612254554057251, "grad_norm": 2.3157975673675537, "learning_rate": 2.457011995138475e-07, "loss": 0.0829, "step": 5590 }, { "epoch": 0.6624083274189733, "grad_norm": 2.2922074794769287, "learning_rate": 2.455769385521756e-07, "loss": 0.074, "step": 5600 }, { "epoch": 0.6635911994322214, "grad_norm": 3.701192855834961, "learning_rate": 2.454526775905037e-07, "loss": 0.0852, "step": 5610 }, { "epoch": 0.6647740714454696, "grad_norm": 2.786501169204712, "learning_rate": 2.453284166288318e-07, "loss": 0.0887, "step": 5620 }, { "epoch": 0.6659569434587178, "grad_norm": 3.2570290565490723, "learning_rate": 2.452041556671599e-07, "loss": 0.0792, "step": 5630 }, { "epoch": 0.6671398154719659, "grad_norm": 3.5635123252868652, "learning_rate": 2.4507989470548803e-07, "loss": 0.0785, "step": 5640 }, { "epoch": 0.6683226874852141, "grad_norm": 2.4307689666748047, "learning_rate": 2.4495563374381614e-07, "loss": 0.0777, "step": 5650 }, { "epoch": 0.6695055594984622, "grad_norm": 1.9711661338806152, "learning_rate": 2.4483137278214424e-07, "loss": 0.0821, "step": 5660 }, { "epoch": 0.6706884315117104, "grad_norm": 2.648186683654785, "learning_rate": 2.447071118204723e-07, "loss": 0.0838, "step": 5670 }, { "epoch": 0.6718713035249586, "grad_norm": 2.232123851776123, "learning_rate": 2.445828508588004e-07, "loss": 0.0796, "step": 5680 }, { "epoch": 0.6730541755382068, "grad_norm": 4.107615947723389, "learning_rate": 2.444585898971285e-07, "loss": 0.0746, "step": 5690 }, { "epoch": 0.674237047551455, "grad_norm": 3.2002999782562256, "learning_rate": 2.443343289354566e-07, "loss": 0.0761, "step": 5700 }, { "epoch": 0.6754199195647032, "grad_norm": 4.873493194580078, "learning_rate": 2.442100679737847e-07, "loss": 0.081, "step": 5710 }, { "epoch": 0.6766027915779512, "grad_norm": 3.1180319786071777, "learning_rate": 2.4408580701211283e-07, "loss": 0.0815, "step": 5720 }, { "epoch": 0.6777856635911994, "grad_norm": 4.457919597625732, "learning_rate": 2.4396154605044094e-07, "loss": 0.081, "step": 5730 }, { "epoch": 0.6789685356044476, "grad_norm": 3.532252788543701, "learning_rate": 2.4383728508876904e-07, "loss": 0.0828, "step": 5740 }, { "epoch": 0.6801514076176958, "grad_norm": 2.8403751850128174, "learning_rate": 2.4371302412709715e-07, "loss": 0.0802, "step": 5750 }, { "epoch": 0.681334279630944, "grad_norm": 2.818466901779175, "learning_rate": 2.435887631654252e-07, "loss": 0.0913, "step": 5760 }, { "epoch": 0.6825171516441921, "grad_norm": 2.276233434677124, "learning_rate": 2.434645022037533e-07, "loss": 0.0933, "step": 5770 }, { "epoch": 0.6837000236574403, "grad_norm": 3.335726022720337, "learning_rate": 2.433402412420814e-07, "loss": 0.0816, "step": 5780 }, { "epoch": 0.6848828956706884, "grad_norm": 2.766068696975708, "learning_rate": 2.432159802804095e-07, "loss": 0.0905, "step": 5790 }, { "epoch": 0.6860657676839366, "grad_norm": 3.8213112354278564, "learning_rate": 2.4309171931873763e-07, "loss": 0.081, "step": 5800 }, { "epoch": 0.6872486396971847, "grad_norm": 2.560025691986084, "learning_rate": 2.4296745835706573e-07, "loss": 0.0834, "step": 5810 }, { "epoch": 0.6884315117104329, "grad_norm": 2.7715392112731934, "learning_rate": 2.4284319739539384e-07, "loss": 0.0809, "step": 5820 }, { "epoch": 0.6896143837236811, "grad_norm": 3.221958637237549, "learning_rate": 2.4271893643372195e-07, "loss": 0.0806, "step": 5830 }, { "epoch": 0.6907972557369293, "grad_norm": 2.649260997772217, "learning_rate": 2.4259467547205e-07, "loss": 0.0789, "step": 5840 }, { "epoch": 0.6919801277501775, "grad_norm": 3.489596366882324, "learning_rate": 2.424704145103781e-07, "loss": 0.0779, "step": 5850 }, { "epoch": 0.6931629997634255, "grad_norm": 2.644028425216675, "learning_rate": 2.423461535487062e-07, "loss": 0.0886, "step": 5860 }, { "epoch": 0.6943458717766737, "grad_norm": 2.0373616218566895, "learning_rate": 2.422218925870343e-07, "loss": 0.0757, "step": 5870 }, { "epoch": 0.6955287437899219, "grad_norm": 3.59130597114563, "learning_rate": 2.420976316253624e-07, "loss": 0.0811, "step": 5880 }, { "epoch": 0.6967116158031701, "grad_norm": 2.031593084335327, "learning_rate": 2.4197337066369053e-07, "loss": 0.0764, "step": 5890 }, { "epoch": 0.6978944878164183, "grad_norm": 2.994112014770508, "learning_rate": 2.4184910970201864e-07, "loss": 0.0777, "step": 5900 }, { "epoch": 0.6990773598296665, "grad_norm": 2.3336379528045654, "learning_rate": 2.4172484874034675e-07, "loss": 0.0826, "step": 5910 }, { "epoch": 0.7002602318429146, "grad_norm": 2.3208582401275635, "learning_rate": 2.416005877786748e-07, "loss": 0.0773, "step": 5920 }, { "epoch": 0.7014431038561627, "grad_norm": 3.1840691566467285, "learning_rate": 2.414763268170029e-07, "loss": 0.0819, "step": 5930 }, { "epoch": 0.7026259758694109, "grad_norm": 3.5671145915985107, "learning_rate": 2.41352065855331e-07, "loss": 0.0715, "step": 5940 }, { "epoch": 0.7038088478826591, "grad_norm": 3.215927839279175, "learning_rate": 2.412278048936591e-07, "loss": 0.0826, "step": 5950 }, { "epoch": 0.7049917198959073, "grad_norm": 2.7185540199279785, "learning_rate": 2.411035439319872e-07, "loss": 0.0821, "step": 5960 }, { "epoch": 0.7061745919091554, "grad_norm": 3.2000768184661865, "learning_rate": 2.4097928297031533e-07, "loss": 0.0778, "step": 5970 }, { "epoch": 0.7073574639224036, "grad_norm": 2.6332738399505615, "learning_rate": 2.4085502200864344e-07, "loss": 0.0783, "step": 5980 }, { "epoch": 0.7085403359356518, "grad_norm": 3.1832165718078613, "learning_rate": 2.4073076104697154e-07, "loss": 0.0822, "step": 5990 }, { "epoch": 0.7097232079489, "grad_norm": 2.2297587394714355, "learning_rate": 2.4060650008529965e-07, "loss": 0.0734, "step": 6000 }, { "epoch": 0.710906079962148, "grad_norm": 3.3259570598602295, "learning_rate": 2.404822391236277e-07, "loss": 0.0833, "step": 6010 }, { "epoch": 0.7120889519753962, "grad_norm": 2.982093095779419, "learning_rate": 2.403579781619558e-07, "loss": 0.0838, "step": 6020 }, { "epoch": 0.7132718239886444, "grad_norm": 2.8181416988372803, "learning_rate": 2.402337172002839e-07, "loss": 0.0832, "step": 6030 }, { "epoch": 0.7144546960018926, "grad_norm": 3.1714606285095215, "learning_rate": 2.40109456238612e-07, "loss": 0.0801, "step": 6040 }, { "epoch": 0.7156375680151408, "grad_norm": 2.2528295516967773, "learning_rate": 2.3998519527694013e-07, "loss": 0.0852, "step": 6050 }, { "epoch": 0.716820440028389, "grad_norm": 3.7271523475646973, "learning_rate": 2.3986093431526824e-07, "loss": 0.0841, "step": 6060 }, { "epoch": 0.7180033120416371, "grad_norm": 3.5095536708831787, "learning_rate": 2.3973667335359634e-07, "loss": 0.0807, "step": 6070 }, { "epoch": 0.7191861840548852, "grad_norm": 2.3852179050445557, "learning_rate": 2.3961241239192445e-07, "loss": 0.0829, "step": 6080 }, { "epoch": 0.7203690560681334, "grad_norm": 2.8258583545684814, "learning_rate": 2.394881514302525e-07, "loss": 0.0768, "step": 6090 }, { "epoch": 0.7215519280813816, "grad_norm": 2.3920257091522217, "learning_rate": 2.393638904685806e-07, "loss": 0.071, "step": 6100 }, { "epoch": 0.7227348000946298, "grad_norm": 2.7465298175811768, "learning_rate": 2.392396295069087e-07, "loss": 0.0847, "step": 6110 }, { "epoch": 0.7239176721078779, "grad_norm": 4.108574390411377, "learning_rate": 2.391153685452368e-07, "loss": 0.0873, "step": 6120 }, { "epoch": 0.7251005441211261, "grad_norm": 4.010007858276367, "learning_rate": 2.3899110758356493e-07, "loss": 0.0796, "step": 6130 }, { "epoch": 0.7262834161343743, "grad_norm": 3.0546953678131104, "learning_rate": 2.3886684662189303e-07, "loss": 0.0843, "step": 6140 }, { "epoch": 0.7274662881476224, "grad_norm": 4.024850368499756, "learning_rate": 2.3874258566022114e-07, "loss": 0.0734, "step": 6150 }, { "epoch": 0.7286491601608706, "grad_norm": 3.2594528198242188, "learning_rate": 2.3861832469854925e-07, "loss": 0.0829, "step": 6160 }, { "epoch": 0.7298320321741187, "grad_norm": 2.9666037559509277, "learning_rate": 2.3849406373687735e-07, "loss": 0.0794, "step": 6170 }, { "epoch": 0.7310149041873669, "grad_norm": 3.204150438308716, "learning_rate": 2.3836980277520543e-07, "loss": 0.0758, "step": 6180 }, { "epoch": 0.7321977762006151, "grad_norm": 3.178006649017334, "learning_rate": 2.382455418135335e-07, "loss": 0.0741, "step": 6190 }, { "epoch": 0.7333806482138633, "grad_norm": 3.7335448265075684, "learning_rate": 2.3812128085186162e-07, "loss": 0.0766, "step": 6200 }, { "epoch": 0.7345635202271115, "grad_norm": 2.433701992034912, "learning_rate": 2.3799701989018975e-07, "loss": 0.0853, "step": 6210 }, { "epoch": 0.7357463922403596, "grad_norm": 3.106215715408325, "learning_rate": 2.3787275892851783e-07, "loss": 0.0884, "step": 6220 }, { "epoch": 0.7369292642536077, "grad_norm": 3.132920503616333, "learning_rate": 2.3774849796684594e-07, "loss": 0.0835, "step": 6230 }, { "epoch": 0.7381121362668559, "grad_norm": 3.186356782913208, "learning_rate": 2.3762423700517404e-07, "loss": 0.0847, "step": 6240 }, { "epoch": 0.7392950082801041, "grad_norm": 2.6927621364593506, "learning_rate": 2.3749997604350212e-07, "loss": 0.073, "step": 6250 }, { "epoch": 0.7404778802933523, "grad_norm": 2.6097283363342285, "learning_rate": 2.3737571508183023e-07, "loss": 0.0947, "step": 6260 }, { "epoch": 0.7416607523066004, "grad_norm": 2.7614214420318604, "learning_rate": 2.372514541201583e-07, "loss": 0.0865, "step": 6270 }, { "epoch": 0.7428436243198486, "grad_norm": 3.993428945541382, "learning_rate": 2.3712719315848642e-07, "loss": 0.075, "step": 6280 }, { "epoch": 0.7440264963330968, "grad_norm": 2.393604278564453, "learning_rate": 2.3700293219681452e-07, "loss": 0.0773, "step": 6290 }, { "epoch": 0.7452093683463449, "grad_norm": 2.7773702144622803, "learning_rate": 2.3687867123514266e-07, "loss": 0.0835, "step": 6300 }, { "epoch": 0.7463922403595931, "grad_norm": 2.0995032787323, "learning_rate": 2.3675441027347074e-07, "loss": 0.0829, "step": 6310 }, { "epoch": 0.7475751123728412, "grad_norm": 3.592357635498047, "learning_rate": 2.3663014931179884e-07, "loss": 0.0821, "step": 6320 }, { "epoch": 0.7487579843860894, "grad_norm": 2.4736573696136475, "learning_rate": 2.3650588835012692e-07, "loss": 0.0832, "step": 6330 }, { "epoch": 0.7499408563993376, "grad_norm": 3.351445436477661, "learning_rate": 2.3638162738845503e-07, "loss": 0.0843, "step": 6340 }, { "epoch": 0.7511237284125858, "grad_norm": 3.0077385902404785, "learning_rate": 2.3625736642678314e-07, "loss": 0.0881, "step": 6350 }, { "epoch": 0.752306600425834, "grad_norm": 2.7538204193115234, "learning_rate": 2.3613310546511122e-07, "loss": 0.0868, "step": 6360 }, { "epoch": 0.753489472439082, "grad_norm": 2.9392008781433105, "learning_rate": 2.3600884450343932e-07, "loss": 0.0823, "step": 6370 }, { "epoch": 0.7546723444523302, "grad_norm": 2.259934902191162, "learning_rate": 2.3588458354176745e-07, "loss": 0.0866, "step": 6380 }, { "epoch": 0.7558552164655784, "grad_norm": 2.968602180480957, "learning_rate": 2.3576032258009553e-07, "loss": 0.082, "step": 6390 }, { "epoch": 0.7570380884788266, "grad_norm": 2.4034950733184814, "learning_rate": 2.3563606161842364e-07, "loss": 0.077, "step": 6400 }, { "epoch": 0.7582209604920748, "grad_norm": 2.9049994945526123, "learning_rate": 2.3551180065675175e-07, "loss": 0.0944, "step": 6410 }, { "epoch": 0.759403832505323, "grad_norm": 2.642573833465576, "learning_rate": 2.3538753969507983e-07, "loss": 0.0822, "step": 6420 }, { "epoch": 0.7605867045185711, "grad_norm": 2.7407031059265137, "learning_rate": 2.3526327873340793e-07, "loss": 0.0829, "step": 6430 }, { "epoch": 0.7617695765318192, "grad_norm": 2.2683587074279785, "learning_rate": 2.3513901777173601e-07, "loss": 0.088, "step": 6440 }, { "epoch": 0.7629524485450674, "grad_norm": 1.9789767265319824, "learning_rate": 2.3501475681006412e-07, "loss": 0.0829, "step": 6450 }, { "epoch": 0.7641353205583156, "grad_norm": 2.4857356548309326, "learning_rate": 2.3489049584839225e-07, "loss": 0.0833, "step": 6460 }, { "epoch": 0.7653181925715637, "grad_norm": 2.356732130050659, "learning_rate": 2.3476623488672033e-07, "loss": 0.0856, "step": 6470 }, { "epoch": 0.7665010645848119, "grad_norm": 2.9552032947540283, "learning_rate": 2.3464197392504844e-07, "loss": 0.0803, "step": 6480 }, { "epoch": 0.7676839365980601, "grad_norm": 2.7772109508514404, "learning_rate": 2.3451771296337655e-07, "loss": 0.0769, "step": 6490 }, { "epoch": 0.7688668086113083, "grad_norm": 3.5251197814941406, "learning_rate": 2.3439345200170463e-07, "loss": 0.0811, "step": 6500 }, { "epoch": 0.7700496806245565, "grad_norm": 2.6038928031921387, "learning_rate": 2.3426919104003273e-07, "loss": 0.0877, "step": 6510 }, { "epoch": 0.7712325526378045, "grad_norm": 2.533623695373535, "learning_rate": 2.3414493007836084e-07, "loss": 0.0872, "step": 6520 }, { "epoch": 0.7724154246510527, "grad_norm": 2.4284827709198, "learning_rate": 2.3402066911668892e-07, "loss": 0.0762, "step": 6530 }, { "epoch": 0.7735982966643009, "grad_norm": 3.8018832206726074, "learning_rate": 2.3389640815501705e-07, "loss": 0.0836, "step": 6540 }, { "epoch": 0.7747811686775491, "grad_norm": 2.3814854621887207, "learning_rate": 2.3377214719334516e-07, "loss": 0.0763, "step": 6550 }, { "epoch": 0.7759640406907973, "grad_norm": 2.8296656608581543, "learning_rate": 2.3364788623167324e-07, "loss": 0.0817, "step": 6560 }, { "epoch": 0.7771469127040455, "grad_norm": 2.8221046924591064, "learning_rate": 2.3352362527000134e-07, "loss": 0.0756, "step": 6570 }, { "epoch": 0.7783297847172936, "grad_norm": 4.415069103240967, "learning_rate": 2.3339936430832942e-07, "loss": 0.0789, "step": 6580 }, { "epoch": 0.7795126567305417, "grad_norm": 2.009554386138916, "learning_rate": 2.3327510334665753e-07, "loss": 0.0801, "step": 6590 }, { "epoch": 0.7806955287437899, "grad_norm": 2.646263360977173, "learning_rate": 2.3315084238498564e-07, "loss": 0.09, "step": 6600 }, { "epoch": 0.7818784007570381, "grad_norm": 2.942478895187378, "learning_rate": 2.3302658142331372e-07, "loss": 0.0829, "step": 6610 }, { "epoch": 0.7830612727702863, "grad_norm": 3.396937131881714, "learning_rate": 2.3290232046164185e-07, "loss": 0.0738, "step": 6620 }, { "epoch": 0.7842441447835344, "grad_norm": 4.119237899780273, "learning_rate": 2.3277805949996996e-07, "loss": 0.0835, "step": 6630 }, { "epoch": 0.7854270167967826, "grad_norm": 3.1859889030456543, "learning_rate": 2.3265379853829804e-07, "loss": 0.0778, "step": 6640 }, { "epoch": 0.7866098888100308, "grad_norm": 2.9354522228240967, "learning_rate": 2.3252953757662614e-07, "loss": 0.0867, "step": 6650 }, { "epoch": 0.7877927608232789, "grad_norm": 2.7523465156555176, "learning_rate": 2.3240527661495425e-07, "loss": 0.0783, "step": 6660 }, { "epoch": 0.788975632836527, "grad_norm": 2.4479949474334717, "learning_rate": 2.3228101565328233e-07, "loss": 0.0723, "step": 6670 }, { "epoch": 0.7901585048497752, "grad_norm": 1.982454538345337, "learning_rate": 2.3215675469161043e-07, "loss": 0.0753, "step": 6680 }, { "epoch": 0.7913413768630234, "grad_norm": 2.3341476917266846, "learning_rate": 2.3203249372993851e-07, "loss": 0.0887, "step": 6690 }, { "epoch": 0.7925242488762716, "grad_norm": 3.722738265991211, "learning_rate": 2.3190823276826662e-07, "loss": 0.0857, "step": 6700 }, { "epoch": 0.7937071208895198, "grad_norm": 3.065152645111084, "learning_rate": 2.3178397180659475e-07, "loss": 0.0793, "step": 6710 }, { "epoch": 0.794889992902768, "grad_norm": 2.8583385944366455, "learning_rate": 2.3165971084492286e-07, "loss": 0.0874, "step": 6720 }, { "epoch": 0.796072864916016, "grad_norm": 2.4003095626831055, "learning_rate": 2.3153544988325094e-07, "loss": 0.0753, "step": 6730 }, { "epoch": 0.7972557369292642, "grad_norm": 2.8597030639648438, "learning_rate": 2.3141118892157905e-07, "loss": 0.0738, "step": 6740 }, { "epoch": 0.7984386089425124, "grad_norm": 2.523311138153076, "learning_rate": 2.3128692795990713e-07, "loss": 0.0724, "step": 6750 }, { "epoch": 0.7996214809557606, "grad_norm": 1.9439315795898438, "learning_rate": 2.3116266699823523e-07, "loss": 0.0787, "step": 6760 }, { "epoch": 0.8008043529690088, "grad_norm": 2.8415327072143555, "learning_rate": 2.3103840603656334e-07, "loss": 0.0857, "step": 6770 }, { "epoch": 0.8019872249822569, "grad_norm": 2.7278313636779785, "learning_rate": 2.3091414507489142e-07, "loss": 0.0759, "step": 6780 }, { "epoch": 0.8031700969955051, "grad_norm": 2.6356382369995117, "learning_rate": 2.3078988411321955e-07, "loss": 0.0738, "step": 6790 }, { "epoch": 0.8043529690087533, "grad_norm": 2.935793399810791, "learning_rate": 2.3066562315154766e-07, "loss": 0.0799, "step": 6800 }, { "epoch": 0.8055358410220014, "grad_norm": 2.584515333175659, "learning_rate": 2.3054136218987574e-07, "loss": 0.0891, "step": 6810 }, { "epoch": 0.8067187130352496, "grad_norm": 2.5307719707489014, "learning_rate": 2.3041710122820385e-07, "loss": 0.0821, "step": 6820 }, { "epoch": 0.8079015850484977, "grad_norm": 3.404010057449341, "learning_rate": 2.3029284026653195e-07, "loss": 0.0833, "step": 6830 }, { "epoch": 0.8090844570617459, "grad_norm": 2.7051620483398438, "learning_rate": 2.3016857930486003e-07, "loss": 0.0744, "step": 6840 }, { "epoch": 0.8102673290749941, "grad_norm": 3.3416061401367188, "learning_rate": 2.3004431834318814e-07, "loss": 0.0762, "step": 6850 }, { "epoch": 0.8114502010882423, "grad_norm": 2.6070683002471924, "learning_rate": 2.2992005738151622e-07, "loss": 0.0794, "step": 6860 }, { "epoch": 0.8126330731014905, "grad_norm": 2.4380221366882324, "learning_rate": 2.2979579641984435e-07, "loss": 0.0782, "step": 6870 }, { "epoch": 0.8138159451147385, "grad_norm": 2.362339735031128, "learning_rate": 2.2967153545817246e-07, "loss": 0.0746, "step": 6880 }, { "epoch": 0.8149988171279867, "grad_norm": 2.4958341121673584, "learning_rate": 2.2954727449650054e-07, "loss": 0.0849, "step": 6890 }, { "epoch": 0.8161816891412349, "grad_norm": 3.3361093997955322, "learning_rate": 2.2942301353482864e-07, "loss": 0.0833, "step": 6900 }, { "epoch": 0.8173645611544831, "grad_norm": 2.914919137954712, "learning_rate": 2.2929875257315675e-07, "loss": 0.0792, "step": 6910 }, { "epoch": 0.8185474331677313, "grad_norm": 2.0193259716033936, "learning_rate": 2.2917449161148483e-07, "loss": 0.0773, "step": 6920 }, { "epoch": 0.8197303051809794, "grad_norm": 2.4654836654663086, "learning_rate": 2.2905023064981294e-07, "loss": 0.0789, "step": 6930 }, { "epoch": 0.8209131771942276, "grad_norm": 4.432389259338379, "learning_rate": 2.2892596968814104e-07, "loss": 0.082, "step": 6940 }, { "epoch": 0.8220960492074757, "grad_norm": 3.0357606410980225, "learning_rate": 2.2880170872646915e-07, "loss": 0.086, "step": 6950 }, { "epoch": 0.8232789212207239, "grad_norm": 2.3066818714141846, "learning_rate": 2.2867744776479726e-07, "loss": 0.0826, "step": 6960 }, { "epoch": 0.8244617932339721, "grad_norm": 2.911778450012207, "learning_rate": 2.2855318680312536e-07, "loss": 0.0872, "step": 6970 }, { "epoch": 0.8256446652472202, "grad_norm": 3.3405308723449707, "learning_rate": 2.2842892584145344e-07, "loss": 0.0833, "step": 6980 }, { "epoch": 0.8268275372604684, "grad_norm": 3.870307683944702, "learning_rate": 2.2830466487978155e-07, "loss": 0.0882, "step": 6990 }, { "epoch": 0.8280104092737166, "grad_norm": 3.517186403274536, "learning_rate": 2.2818040391810963e-07, "loss": 0.0731, "step": 7000 }, { "epoch": 0.8291932812869648, "grad_norm": 2.2469067573547363, "learning_rate": 2.2805614295643773e-07, "loss": 0.0828, "step": 7010 }, { "epoch": 0.830376153300213, "grad_norm": 2.6540887355804443, "learning_rate": 2.2793188199476584e-07, "loss": 0.0831, "step": 7020 }, { "epoch": 0.831559025313461, "grad_norm": 2.444990634918213, "learning_rate": 2.2780762103309395e-07, "loss": 0.0785, "step": 7030 }, { "epoch": 0.8327418973267092, "grad_norm": 2.762026786804199, "learning_rate": 2.2768336007142205e-07, "loss": 0.0802, "step": 7040 }, { "epoch": 0.8339247693399574, "grad_norm": 2.296597719192505, "learning_rate": 2.2755909910975016e-07, "loss": 0.0781, "step": 7050 }, { "epoch": 0.8351076413532056, "grad_norm": 2.4298095703125, "learning_rate": 2.2743483814807824e-07, "loss": 0.0827, "step": 7060 }, { "epoch": 0.8362905133664538, "grad_norm": 2.790872812271118, "learning_rate": 2.2731057718640635e-07, "loss": 0.0806, "step": 7070 }, { "epoch": 0.837473385379702, "grad_norm": 2.291260242462158, "learning_rate": 2.2718631622473445e-07, "loss": 0.0796, "step": 7080 }, { "epoch": 0.8386562573929501, "grad_norm": 3.2361810207366943, "learning_rate": 2.2706205526306253e-07, "loss": 0.074, "step": 7090 }, { "epoch": 0.8398391294061982, "grad_norm": 2.27866530418396, "learning_rate": 2.2693779430139064e-07, "loss": 0.0805, "step": 7100 }, { "epoch": 0.8410220014194464, "grad_norm": 3.233788251876831, "learning_rate": 2.2681353333971872e-07, "loss": 0.0825, "step": 7110 }, { "epoch": 0.8422048734326946, "grad_norm": 3.048963785171509, "learning_rate": 2.2668927237804685e-07, "loss": 0.0799, "step": 7120 }, { "epoch": 0.8433877454459427, "grad_norm": 2.551544189453125, "learning_rate": 2.2656501141637496e-07, "loss": 0.0875, "step": 7130 }, { "epoch": 0.8445706174591909, "grad_norm": 3.152825355529785, "learning_rate": 2.2644075045470306e-07, "loss": 0.0839, "step": 7140 }, { "epoch": 0.8457534894724391, "grad_norm": 3.4527697563171387, "learning_rate": 2.2631648949303114e-07, "loss": 0.0829, "step": 7150 }, { "epoch": 0.8469363614856873, "grad_norm": 2.2990529537200928, "learning_rate": 2.2619222853135925e-07, "loss": 0.0952, "step": 7160 }, { "epoch": 0.8481192334989354, "grad_norm": 2.5060782432556152, "learning_rate": 2.2606796756968733e-07, "loss": 0.0731, "step": 7170 }, { "epoch": 0.8493021055121835, "grad_norm": 2.9228427410125732, "learning_rate": 2.2594370660801544e-07, "loss": 0.082, "step": 7180 }, { "epoch": 0.8504849775254317, "grad_norm": 2.887554407119751, "learning_rate": 2.2581944564634354e-07, "loss": 0.0767, "step": 7190 }, { "epoch": 0.8516678495386799, "grad_norm": 4.000696659088135, "learning_rate": 2.2569518468467165e-07, "loss": 0.0778, "step": 7200 }, { "epoch": 0.8528507215519281, "grad_norm": 2.8390395641326904, "learning_rate": 2.2557092372299976e-07, "loss": 0.0852, "step": 7210 }, { "epoch": 0.8540335935651763, "grad_norm": 2.719064950942993, "learning_rate": 2.2544666276132786e-07, "loss": 0.0854, "step": 7220 }, { "epoch": 0.8552164655784245, "grad_norm": 3.0536301136016846, "learning_rate": 2.2532240179965594e-07, "loss": 0.083, "step": 7230 }, { "epoch": 0.8563993375916725, "grad_norm": 2.892967700958252, "learning_rate": 2.2519814083798405e-07, "loss": 0.0768, "step": 7240 }, { "epoch": 0.8575822096049207, "grad_norm": 2.928757905960083, "learning_rate": 2.2507387987631216e-07, "loss": 0.08, "step": 7250 }, { "epoch": 0.8587650816181689, "grad_norm": 2.489948272705078, "learning_rate": 2.2494961891464024e-07, "loss": 0.0797, "step": 7260 }, { "epoch": 0.8599479536314171, "grad_norm": 2.6780011653900146, "learning_rate": 2.2482535795296834e-07, "loss": 0.0775, "step": 7270 }, { "epoch": 0.8611308256446653, "grad_norm": 1.9068881273269653, "learning_rate": 2.2470109699129647e-07, "loss": 0.0794, "step": 7280 }, { "epoch": 0.8623136976579134, "grad_norm": 2.4795637130737305, "learning_rate": 2.2457683602962455e-07, "loss": 0.0769, "step": 7290 }, { "epoch": 0.8634965696711616, "grad_norm": 2.28903865814209, "learning_rate": 2.2445257506795266e-07, "loss": 0.0748, "step": 7300 }, { "epoch": 0.8646794416844098, "grad_norm": 2.298031806945801, "learning_rate": 2.2432831410628074e-07, "loss": 0.0846, "step": 7310 }, { "epoch": 0.8658623136976579, "grad_norm": 2.407175064086914, "learning_rate": 2.2420405314460885e-07, "loss": 0.0774, "step": 7320 }, { "epoch": 0.867045185710906, "grad_norm": 2.5083131790161133, "learning_rate": 2.2407979218293695e-07, "loss": 0.0775, "step": 7330 }, { "epoch": 0.8682280577241542, "grad_norm": 2.5747714042663574, "learning_rate": 2.2395553122126503e-07, "loss": 0.0805, "step": 7340 }, { "epoch": 0.8694109297374024, "grad_norm": 4.342471599578857, "learning_rate": 2.2383127025959314e-07, "loss": 0.0667, "step": 7350 }, { "epoch": 0.8705938017506506, "grad_norm": 2.5876171588897705, "learning_rate": 2.2370700929792127e-07, "loss": 0.0783, "step": 7360 }, { "epoch": 0.8717766737638988, "grad_norm": 2.095641613006592, "learning_rate": 2.2358274833624935e-07, "loss": 0.0828, "step": 7370 }, { "epoch": 0.872959545777147, "grad_norm": 2.0888924598693848, "learning_rate": 2.2345848737457746e-07, "loss": 0.0798, "step": 7380 }, { "epoch": 0.874142417790395, "grad_norm": 2.818875312805176, "learning_rate": 2.2333422641290557e-07, "loss": 0.0842, "step": 7390 }, { "epoch": 0.8753252898036432, "grad_norm": 2.1577043533325195, "learning_rate": 2.2320996545123365e-07, "loss": 0.088, "step": 7400 }, { "epoch": 0.8765081618168914, "grad_norm": 1.7401179075241089, "learning_rate": 2.2308570448956175e-07, "loss": 0.0782, "step": 7410 }, { "epoch": 0.8776910338301396, "grad_norm": 3.1824846267700195, "learning_rate": 2.2296144352788983e-07, "loss": 0.0817, "step": 7420 }, { "epoch": 0.8788739058433878, "grad_norm": 3.0710573196411133, "learning_rate": 2.2283718256621794e-07, "loss": 0.0781, "step": 7430 }, { "epoch": 0.8800567778566359, "grad_norm": 3.4253475666046143, "learning_rate": 2.2271292160454607e-07, "loss": 0.0848, "step": 7440 }, { "epoch": 0.8812396498698841, "grad_norm": 2.1270899772644043, "learning_rate": 2.2258866064287415e-07, "loss": 0.0868, "step": 7450 }, { "epoch": 0.8824225218831322, "grad_norm": 2.9918246269226074, "learning_rate": 2.2246439968120226e-07, "loss": 0.0806, "step": 7460 }, { "epoch": 0.8836053938963804, "grad_norm": 2.0600199699401855, "learning_rate": 2.2234013871953036e-07, "loss": 0.0832, "step": 7470 }, { "epoch": 0.8847882659096286, "grad_norm": 3.150747060775757, "learning_rate": 2.2221587775785844e-07, "loss": 0.0816, "step": 7480 }, { "epoch": 0.8859711379228767, "grad_norm": 3.5105652809143066, "learning_rate": 2.2209161679618655e-07, "loss": 0.084, "step": 7490 }, { "epoch": 0.8871540099361249, "grad_norm": 2.3190805912017822, "learning_rate": 2.2196735583451466e-07, "loss": 0.0743, "step": 7500 }, { "epoch": 0.8883368819493731, "grad_norm": 3.28234601020813, "learning_rate": 2.2184309487284274e-07, "loss": 0.0749, "step": 7510 }, { "epoch": 0.8895197539626213, "grad_norm": 2.5282111167907715, "learning_rate": 2.2171883391117084e-07, "loss": 0.0792, "step": 7520 }, { "epoch": 0.8907026259758695, "grad_norm": 2.2714719772338867, "learning_rate": 2.2159457294949898e-07, "loss": 0.0753, "step": 7530 }, { "epoch": 0.8918854979891175, "grad_norm": 2.893171787261963, "learning_rate": 2.2147031198782706e-07, "loss": 0.0731, "step": 7540 }, { "epoch": 0.8930683700023657, "grad_norm": 2.5414984226226807, "learning_rate": 2.2134605102615516e-07, "loss": 0.0799, "step": 7550 }, { "epoch": 0.8942512420156139, "grad_norm": 2.9286720752716064, "learning_rate": 2.2122179006448327e-07, "loss": 0.0865, "step": 7560 }, { "epoch": 0.8954341140288621, "grad_norm": 2.2986555099487305, "learning_rate": 2.2109752910281135e-07, "loss": 0.0813, "step": 7570 }, { "epoch": 0.8966169860421103, "grad_norm": 3.054816961288452, "learning_rate": 2.2097326814113945e-07, "loss": 0.0738, "step": 7580 }, { "epoch": 0.8977998580553584, "grad_norm": 2.8551993370056152, "learning_rate": 2.2084900717946753e-07, "loss": 0.0746, "step": 7590 }, { "epoch": 0.8989827300686066, "grad_norm": 2.458814859390259, "learning_rate": 2.2072474621779564e-07, "loss": 0.0754, "step": 7600 }, { "epoch": 0.9001656020818547, "grad_norm": 4.028676986694336, "learning_rate": 2.2060048525612377e-07, "loss": 0.0853, "step": 7610 }, { "epoch": 0.9013484740951029, "grad_norm": 2.35184383392334, "learning_rate": 2.2047622429445185e-07, "loss": 0.0751, "step": 7620 }, { "epoch": 0.9025313461083511, "grad_norm": 3.0126266479492188, "learning_rate": 2.2035196333277996e-07, "loss": 0.0871, "step": 7630 }, { "epoch": 0.9037142181215992, "grad_norm": 2.51008677482605, "learning_rate": 2.2022770237110807e-07, "loss": 0.072, "step": 7640 }, { "epoch": 0.9048970901348474, "grad_norm": 4.494469165802002, "learning_rate": 2.2010344140943615e-07, "loss": 0.0807, "step": 7650 }, { "epoch": 0.9060799621480956, "grad_norm": 3.058258295059204, "learning_rate": 2.1997918044776425e-07, "loss": 0.0737, "step": 7660 }, { "epoch": 0.9072628341613438, "grad_norm": 2.8899824619293213, "learning_rate": 2.1985491948609236e-07, "loss": 0.081, "step": 7670 }, { "epoch": 0.9084457061745919, "grad_norm": 3.1135807037353516, "learning_rate": 2.1973065852442044e-07, "loss": 0.0758, "step": 7680 }, { "epoch": 0.90962857818784, "grad_norm": 2.943168878555298, "learning_rate": 2.1960639756274857e-07, "loss": 0.0734, "step": 7690 }, { "epoch": 0.9108114502010882, "grad_norm": 4.5543951988220215, "learning_rate": 2.1948213660107668e-07, "loss": 0.0801, "step": 7700 }, { "epoch": 0.9119943222143364, "grad_norm": 2.273134708404541, "learning_rate": 2.1935787563940476e-07, "loss": 0.0809, "step": 7710 }, { "epoch": 0.9131771942275846, "grad_norm": 2.7595739364624023, "learning_rate": 2.1923361467773286e-07, "loss": 0.0793, "step": 7720 }, { "epoch": 0.9143600662408328, "grad_norm": 3.4362125396728516, "learning_rate": 2.1910935371606094e-07, "loss": 0.0884, "step": 7730 }, { "epoch": 0.915542938254081, "grad_norm": 2.5967423915863037, "learning_rate": 2.1898509275438905e-07, "loss": 0.0763, "step": 7740 }, { "epoch": 0.916725810267329, "grad_norm": 3.411163330078125, "learning_rate": 2.1886083179271716e-07, "loss": 0.0776, "step": 7750 }, { "epoch": 0.9179086822805772, "grad_norm": 3.076286554336548, "learning_rate": 2.1873657083104524e-07, "loss": 0.091, "step": 7760 }, { "epoch": 0.9190915542938254, "grad_norm": 2.6959290504455566, "learning_rate": 2.1861230986937337e-07, "loss": 0.0809, "step": 7770 }, { "epoch": 0.9202744263070736, "grad_norm": 1.869222640991211, "learning_rate": 2.1848804890770148e-07, "loss": 0.0734, "step": 7780 }, { "epoch": 0.9214572983203217, "grad_norm": 4.628353118896484, "learning_rate": 2.1836378794602956e-07, "loss": 0.0786, "step": 7790 }, { "epoch": 0.9226401703335699, "grad_norm": 3.9674925804138184, "learning_rate": 2.1823952698435766e-07, "loss": 0.0786, "step": 7800 }, { "epoch": 0.9238230423468181, "grad_norm": 1.9479893445968628, "learning_rate": 2.1811526602268577e-07, "loss": 0.1046, "step": 7810 }, { "epoch": 0.9250059143600663, "grad_norm": 2.150958776473999, "learning_rate": 2.1799100506101385e-07, "loss": 0.0771, "step": 7820 }, { "epoch": 0.9261887863733144, "grad_norm": 2.4033443927764893, "learning_rate": 2.1786674409934196e-07, "loss": 0.0825, "step": 7830 }, { "epoch": 0.9273716583865625, "grad_norm": 2.5699331760406494, "learning_rate": 2.1774248313767004e-07, "loss": 0.0785, "step": 7840 }, { "epoch": 0.9285545303998107, "grad_norm": 1.7450093030929565, "learning_rate": 2.1761822217599814e-07, "loss": 0.0716, "step": 7850 }, { "epoch": 0.9297374024130589, "grad_norm": 1.849411964416504, "learning_rate": 2.1749396121432627e-07, "loss": 0.0798, "step": 7860 }, { "epoch": 0.9309202744263071, "grad_norm": 3.070634603500366, "learning_rate": 2.1736970025265435e-07, "loss": 0.0801, "step": 7870 }, { "epoch": 0.9321031464395553, "grad_norm": 3.1070401668548584, "learning_rate": 2.1724543929098246e-07, "loss": 0.0786, "step": 7880 }, { "epoch": 0.9332860184528035, "grad_norm": 2.6870181560516357, "learning_rate": 2.1712117832931057e-07, "loss": 0.0839, "step": 7890 }, { "epoch": 0.9344688904660515, "grad_norm": 3.3481647968292236, "learning_rate": 2.1699691736763865e-07, "loss": 0.073, "step": 7900 }, { "epoch": 0.9356517624792997, "grad_norm": 3.239243268966675, "learning_rate": 2.1687265640596675e-07, "loss": 0.0837, "step": 7910 }, { "epoch": 0.9368346344925479, "grad_norm": 2.576031446456909, "learning_rate": 2.1674839544429486e-07, "loss": 0.0765, "step": 7920 }, { "epoch": 0.9380175065057961, "grad_norm": 1.8821099996566772, "learning_rate": 2.1662413448262294e-07, "loss": 0.0739, "step": 7930 }, { "epoch": 0.9392003785190443, "grad_norm": 2.638596296310425, "learning_rate": 2.1649987352095107e-07, "loss": 0.0832, "step": 7940 }, { "epoch": 0.9403832505322924, "grad_norm": 2.7849528789520264, "learning_rate": 2.1637561255927918e-07, "loss": 0.0824, "step": 7950 }, { "epoch": 0.9415661225455406, "grad_norm": 2.303889513015747, "learning_rate": 2.1625135159760726e-07, "loss": 0.0844, "step": 7960 }, { "epoch": 0.9427489945587887, "grad_norm": 2.3808207511901855, "learning_rate": 2.1612709063593537e-07, "loss": 0.0827, "step": 7970 }, { "epoch": 0.9439318665720369, "grad_norm": 3.5343451499938965, "learning_rate": 2.1600282967426347e-07, "loss": 0.0818, "step": 7980 }, { "epoch": 0.945114738585285, "grad_norm": 3.383157730102539, "learning_rate": 2.1587856871259155e-07, "loss": 0.0765, "step": 7990 }, { "epoch": 0.9462976105985332, "grad_norm": 2.465620279312134, "learning_rate": 2.1575430775091966e-07, "loss": 0.0708, "step": 8000 }, { "epoch": 0.9474804826117814, "grad_norm": 2.058793783187866, "learning_rate": 2.1563004678924774e-07, "loss": 0.0809, "step": 8010 }, { "epoch": 0.9486633546250296, "grad_norm": 2.2567756175994873, "learning_rate": 2.1550578582757587e-07, "loss": 0.0742, "step": 8020 }, { "epoch": 0.9498462266382778, "grad_norm": 2.2249648571014404, "learning_rate": 2.1538152486590398e-07, "loss": 0.0774, "step": 8030 }, { "epoch": 0.9510290986515259, "grad_norm": 1.9269020557403564, "learning_rate": 2.1525726390423206e-07, "loss": 0.0685, "step": 8040 }, { "epoch": 0.952211970664774, "grad_norm": 3.586350917816162, "learning_rate": 2.1513300294256016e-07, "loss": 0.0761, "step": 8050 }, { "epoch": 0.9533948426780222, "grad_norm": 1.7848161458969116, "learning_rate": 2.1500874198088827e-07, "loss": 0.0707, "step": 8060 }, { "epoch": 0.9545777146912704, "grad_norm": 2.5594489574432373, "learning_rate": 2.1488448101921635e-07, "loss": 0.0787, "step": 8070 }, { "epoch": 0.9557605867045186, "grad_norm": 3.096752166748047, "learning_rate": 2.1476022005754446e-07, "loss": 0.0791, "step": 8080 }, { "epoch": 0.9569434587177668, "grad_norm": 1.9905550479888916, "learning_rate": 2.1463595909587256e-07, "loss": 0.0761, "step": 8090 }, { "epoch": 0.9581263307310149, "grad_norm": 2.2595865726470947, "learning_rate": 2.1451169813420067e-07, "loss": 0.0833, "step": 8100 }, { "epoch": 0.9593092027442631, "grad_norm": 2.240837574005127, "learning_rate": 2.1438743717252878e-07, "loss": 0.0783, "step": 8110 }, { "epoch": 0.9604920747575112, "grad_norm": 2.9329092502593994, "learning_rate": 2.1426317621085688e-07, "loss": 0.0761, "step": 8120 }, { "epoch": 0.9616749467707594, "grad_norm": 2.6702888011932373, "learning_rate": 2.1413891524918496e-07, "loss": 0.0749, "step": 8130 }, { "epoch": 0.9628578187840076, "grad_norm": 2.5349113941192627, "learning_rate": 2.1401465428751307e-07, "loss": 0.0703, "step": 8140 }, { "epoch": 0.9640406907972557, "grad_norm": 2.419372797012329, "learning_rate": 2.1389039332584115e-07, "loss": 0.0822, "step": 8150 }, { "epoch": 0.9652235628105039, "grad_norm": 1.7324497699737549, "learning_rate": 2.1376613236416925e-07, "loss": 0.0756, "step": 8160 }, { "epoch": 0.9664064348237521, "grad_norm": 2.037790298461914, "learning_rate": 2.1364187140249736e-07, "loss": 0.0767, "step": 8170 }, { "epoch": 0.9675893068370003, "grad_norm": 2.7648956775665283, "learning_rate": 2.1351761044082547e-07, "loss": 0.0769, "step": 8180 }, { "epoch": 0.9687721788502484, "grad_norm": 4.684494972229004, "learning_rate": 2.1339334947915357e-07, "loss": 0.087, "step": 8190 }, { "epoch": 0.9699550508634965, "grad_norm": 1.8626960515975952, "learning_rate": 2.1326908851748168e-07, "loss": 0.0776, "step": 8200 }, { "epoch": 0.9711379228767447, "grad_norm": 1.9082026481628418, "learning_rate": 2.1314482755580976e-07, "loss": 0.0807, "step": 8210 }, { "epoch": 0.9723207948899929, "grad_norm": 2.220022201538086, "learning_rate": 2.1302056659413787e-07, "loss": 0.0781, "step": 8220 }, { "epoch": 0.9735036669032411, "grad_norm": 2.570674419403076, "learning_rate": 2.1289630563246597e-07, "loss": 0.0905, "step": 8230 }, { "epoch": 0.9746865389164893, "grad_norm": 2.683723211288452, "learning_rate": 2.1277204467079405e-07, "loss": 0.0886, "step": 8240 }, { "epoch": 0.9758694109297374, "grad_norm": 2.6624467372894287, "learning_rate": 2.1264778370912216e-07, "loss": 0.0776, "step": 8250 }, { "epoch": 0.9770522829429855, "grad_norm": 1.9196579456329346, "learning_rate": 2.1252352274745024e-07, "loss": 0.0774, "step": 8260 }, { "epoch": 0.9782351549562337, "grad_norm": 3.2187459468841553, "learning_rate": 2.1239926178577837e-07, "loss": 0.0838, "step": 8270 }, { "epoch": 0.9794180269694819, "grad_norm": 2.451727867126465, "learning_rate": 2.1227500082410648e-07, "loss": 0.0799, "step": 8280 }, { "epoch": 0.9806008989827301, "grad_norm": 2.8609373569488525, "learning_rate": 2.1215073986243456e-07, "loss": 0.0774, "step": 8290 }, { "epoch": 0.9817837709959782, "grad_norm": 1.4480849504470825, "learning_rate": 2.1202647890076266e-07, "loss": 0.0791, "step": 8300 }, { "epoch": 0.9829666430092264, "grad_norm": 3.00162672996521, "learning_rate": 2.1190221793909077e-07, "loss": 0.0861, "step": 8310 }, { "epoch": 0.9841495150224746, "grad_norm": 2.522026777267456, "learning_rate": 2.1177795697741885e-07, "loss": 0.0788, "step": 8320 }, { "epoch": 0.9853323870357228, "grad_norm": 2.7136173248291016, "learning_rate": 2.1165369601574696e-07, "loss": 0.0798, "step": 8330 }, { "epoch": 0.9865152590489709, "grad_norm": 2.488757848739624, "learning_rate": 2.1152943505407506e-07, "loss": 0.0799, "step": 8340 }, { "epoch": 0.987698131062219, "grad_norm": 2.8211421966552734, "learning_rate": 2.1140517409240317e-07, "loss": 0.0798, "step": 8350 }, { "epoch": 0.9888810030754672, "grad_norm": 2.1966054439544678, "learning_rate": 2.1128091313073128e-07, "loss": 0.087, "step": 8360 }, { "epoch": 0.9900638750887154, "grad_norm": 2.5527126789093018, "learning_rate": 2.1115665216905938e-07, "loss": 0.0822, "step": 8370 }, { "epoch": 0.9912467471019636, "grad_norm": 2.126210927963257, "learning_rate": 2.1103239120738746e-07, "loss": 0.0705, "step": 8380 }, { "epoch": 0.9924296191152118, "grad_norm": 3.0045626163482666, "learning_rate": 2.1090813024571557e-07, "loss": 0.085, "step": 8390 }, { "epoch": 0.99361249112846, "grad_norm": 4.732204437255859, "learning_rate": 2.1078386928404368e-07, "loss": 0.0698, "step": 8400 }, { "epoch": 0.994795363141708, "grad_norm": 2.096775770187378, "learning_rate": 2.1065960832237176e-07, "loss": 0.0729, "step": 8410 }, { "epoch": 0.9959782351549562, "grad_norm": 2.84159255027771, "learning_rate": 2.1053534736069986e-07, "loss": 0.0713, "step": 8420 }, { "epoch": 0.9971611071682044, "grad_norm": 2.7287333011627197, "learning_rate": 2.10411086399028e-07, "loss": 0.0732, "step": 8430 }, { "epoch": 0.9983439791814526, "grad_norm": 2.5874195098876953, "learning_rate": 2.1028682543735607e-07, "loss": 0.0795, "step": 8440 }, { "epoch": 0.9995268511947007, "grad_norm": 2.5173277854919434, "learning_rate": 2.1016256447568418e-07, "loss": 0.0688, "step": 8450 }, { "epoch": 1.0, "eval_accuracy": 0.6890907276175267, "eval_flagged/accuracy": 0.8491366403832717, "eval_flagged/f1": 0.8590259750353639, "eval_flagged/precision": 0.8947862694300518, "eval_flagged/recall": 0.8260141699799707, "eval_loss": 0.07985769212245941, "eval_macro_f1": 0.6367365019447472, "eval_macro_precision": 0.7276364892531203, "eval_macro_recall": 0.5930679903194713, "eval_micro_f1": 0.7463962979503797, "eval_micro_precision": 0.8014978980449777, "eval_micro_recall": 0.6983836264381101, "eval_runtime": 86.7593, "eval_samples_per_second": 692.882, "eval_steps_per_second": 5.417, "step": 8454 }, { "epoch": 1.000709723207949, "grad_norm": 2.0986499786376953, "learning_rate": 2.1003830351401226e-07, "loss": 0.0826, "step": 8460 }, { "epoch": 1.001892595221197, "grad_norm": 4.088377952575684, "learning_rate": 2.0991404255234037e-07, "loss": 0.0707, "step": 8470 }, { "epoch": 1.0030754672344453, "grad_norm": 3.5084316730499268, "learning_rate": 2.0978978159066847e-07, "loss": 0.073, "step": 8480 }, { "epoch": 1.0042583392476934, "grad_norm": 2.6616859436035156, "learning_rate": 2.0966552062899655e-07, "loss": 0.0785, "step": 8490 }, { "epoch": 1.0054412112609417, "grad_norm": 1.8741695880889893, "learning_rate": 2.0954125966732466e-07, "loss": 0.078, "step": 8500 }, { "epoch": 1.0066240832741897, "grad_norm": 2.7200450897216797, "learning_rate": 2.094169987056528e-07, "loss": 0.0803, "step": 8510 }, { "epoch": 1.0078069552874378, "grad_norm": 2.6367361545562744, "learning_rate": 2.0929273774398087e-07, "loss": 0.0756, "step": 8520 }, { "epoch": 1.008989827300686, "grad_norm": 2.9308815002441406, "learning_rate": 2.0916847678230898e-07, "loss": 0.0763, "step": 8530 }, { "epoch": 1.0101726993139342, "grad_norm": 2.5258383750915527, "learning_rate": 2.0904421582063709e-07, "loss": 0.0753, "step": 8540 }, { "epoch": 1.0113555713271825, "grad_norm": 2.3440134525299072, "learning_rate": 2.0891995485896517e-07, "loss": 0.083, "step": 8550 }, { "epoch": 1.0125384433404305, "grad_norm": 3.7482261657714844, "learning_rate": 2.0879569389729327e-07, "loss": 0.0748, "step": 8560 }, { "epoch": 1.0137213153536788, "grad_norm": 2.6435649394989014, "learning_rate": 2.0867143293562135e-07, "loss": 0.0705, "step": 8570 }, { "epoch": 1.014904187366927, "grad_norm": 2.5334279537200928, "learning_rate": 2.0854717197394946e-07, "loss": 0.0819, "step": 8580 }, { "epoch": 1.016087059380175, "grad_norm": 3.1520018577575684, "learning_rate": 2.084229110122776e-07, "loss": 0.0741, "step": 8590 }, { "epoch": 1.0172699313934233, "grad_norm": 2.3893392086029053, "learning_rate": 2.0829865005060567e-07, "loss": 0.0801, "step": 8600 }, { "epoch": 1.0184528034066713, "grad_norm": 2.594447135925293, "learning_rate": 2.0817438908893378e-07, "loss": 0.0801, "step": 8610 }, { "epoch": 1.0196356754199196, "grad_norm": 2.9921815395355225, "learning_rate": 2.0805012812726188e-07, "loss": 0.072, "step": 8620 }, { "epoch": 1.0208185474331677, "grad_norm": 3.3478755950927734, "learning_rate": 2.0792586716558996e-07, "loss": 0.0803, "step": 8630 }, { "epoch": 1.022001419446416, "grad_norm": 2.3052449226379395, "learning_rate": 2.0780160620391807e-07, "loss": 0.0734, "step": 8640 }, { "epoch": 1.023184291459664, "grad_norm": 2.7590808868408203, "learning_rate": 2.0767734524224618e-07, "loss": 0.0827, "step": 8650 }, { "epoch": 1.0243671634729121, "grad_norm": 2.637211799621582, "learning_rate": 2.0755308428057426e-07, "loss": 0.073, "step": 8660 }, { "epoch": 1.0255500354861604, "grad_norm": 3.336412191390991, "learning_rate": 2.0742882331890236e-07, "loss": 0.0733, "step": 8670 }, { "epoch": 1.0267329074994085, "grad_norm": 2.9917032718658447, "learning_rate": 2.073045623572305e-07, "loss": 0.0698, "step": 8680 }, { "epoch": 1.0279157795126568, "grad_norm": 2.1019673347473145, "learning_rate": 2.0718030139555858e-07, "loss": 0.0746, "step": 8690 }, { "epoch": 1.0290986515259049, "grad_norm": 2.375352382659912, "learning_rate": 2.0705604043388668e-07, "loss": 0.0761, "step": 8700 }, { "epoch": 1.0302815235391531, "grad_norm": 3.2611489295959473, "learning_rate": 2.0693177947221476e-07, "loss": 0.0751, "step": 8710 }, { "epoch": 1.0314643955524012, "grad_norm": 3.405688524246216, "learning_rate": 2.0680751851054287e-07, "loss": 0.0768, "step": 8720 }, { "epoch": 1.0326472675656495, "grad_norm": 2.9047107696533203, "learning_rate": 2.0668325754887098e-07, "loss": 0.0722, "step": 8730 }, { "epoch": 1.0338301395788976, "grad_norm": 4.075443744659424, "learning_rate": 2.0655899658719905e-07, "loss": 0.0653, "step": 8740 }, { "epoch": 1.0350130115921456, "grad_norm": 2.385829448699951, "learning_rate": 2.0643473562552716e-07, "loss": 0.0725, "step": 8750 }, { "epoch": 1.036195883605394, "grad_norm": 2.719733476638794, "learning_rate": 2.063104746638553e-07, "loss": 0.0812, "step": 8760 }, { "epoch": 1.037378755618642, "grad_norm": 2.481539011001587, "learning_rate": 2.0618621370218337e-07, "loss": 0.0729, "step": 8770 }, { "epoch": 1.0385616276318903, "grad_norm": 3.510097026824951, "learning_rate": 2.0606195274051148e-07, "loss": 0.0714, "step": 8780 }, { "epoch": 1.0397444996451384, "grad_norm": 11.504462242126465, "learning_rate": 2.059376917788396e-07, "loss": 0.0743, "step": 8790 }, { "epoch": 1.0409273716583867, "grad_norm": 1.6750767230987549, "learning_rate": 2.0581343081716767e-07, "loss": 0.0773, "step": 8800 }, { "epoch": 1.0421102436716347, "grad_norm": 2.878331422805786, "learning_rate": 2.0568916985549577e-07, "loss": 0.0759, "step": 8810 }, { "epoch": 1.0432931156848828, "grad_norm": 3.340073823928833, "learning_rate": 2.0556490889382388e-07, "loss": 0.0774, "step": 8820 }, { "epoch": 1.044475987698131, "grad_norm": 3.5718281269073486, "learning_rate": 2.0544064793215196e-07, "loss": 0.076, "step": 8830 }, { "epoch": 1.0456588597113792, "grad_norm": 2.3918213844299316, "learning_rate": 2.053163869704801e-07, "loss": 0.0728, "step": 8840 }, { "epoch": 1.0468417317246275, "grad_norm": 2.0760793685913086, "learning_rate": 2.051921260088082e-07, "loss": 0.0788, "step": 8850 }, { "epoch": 1.0480246037378755, "grad_norm": 2.4933981895446777, "learning_rate": 2.0506786504713628e-07, "loss": 0.0764, "step": 8860 }, { "epoch": 1.0492074757511238, "grad_norm": 2.8113608360290527, "learning_rate": 2.0494360408546439e-07, "loss": 0.0784, "step": 8870 }, { "epoch": 1.050390347764372, "grad_norm": 3.1028501987457275, "learning_rate": 2.0481934312379247e-07, "loss": 0.0707, "step": 8880 }, { "epoch": 1.05157321977762, "grad_norm": 2.622908592224121, "learning_rate": 2.0469508216212057e-07, "loss": 0.0806, "step": 8890 }, { "epoch": 1.0527560917908683, "grad_norm": 2.0278289318084717, "learning_rate": 2.0457082120044868e-07, "loss": 0.0841, "step": 8900 }, { "epoch": 1.0539389638041163, "grad_norm": 4.181419849395752, "learning_rate": 2.0444656023877676e-07, "loss": 0.0741, "step": 8910 }, { "epoch": 1.0551218358173646, "grad_norm": 2.5596425533294678, "learning_rate": 2.043222992771049e-07, "loss": 0.0708, "step": 8920 }, { "epoch": 1.0563047078306127, "grad_norm": 3.4493565559387207, "learning_rate": 2.04198038315433e-07, "loss": 0.0771, "step": 8930 }, { "epoch": 1.057487579843861, "grad_norm": 2.5968263149261475, "learning_rate": 2.0407377735376108e-07, "loss": 0.0773, "step": 8940 }, { "epoch": 1.058670451857109, "grad_norm": 2.8938093185424805, "learning_rate": 2.0394951639208918e-07, "loss": 0.0716, "step": 8950 }, { "epoch": 1.0598533238703571, "grad_norm": 2.6362671852111816, "learning_rate": 2.038252554304173e-07, "loss": 0.0759, "step": 8960 }, { "epoch": 1.0610361958836054, "grad_norm": 2.409360408782959, "learning_rate": 2.0370099446874537e-07, "loss": 0.0816, "step": 8970 }, { "epoch": 1.0622190678968535, "grad_norm": 1.821694254875183, "learning_rate": 2.0357673350707348e-07, "loss": 0.0777, "step": 8980 }, { "epoch": 1.0634019399101018, "grad_norm": 2.149247884750366, "learning_rate": 2.0345247254540156e-07, "loss": 0.0835, "step": 8990 }, { "epoch": 1.0645848119233499, "grad_norm": 1.8219929933547974, "learning_rate": 2.033282115837297e-07, "loss": 0.0797, "step": 9000 }, { "epoch": 1.0657676839365982, "grad_norm": 2.1583893299102783, "learning_rate": 2.032039506220578e-07, "loss": 0.0789, "step": 9010 }, { "epoch": 1.0669505559498462, "grad_norm": 3.287797451019287, "learning_rate": 2.0307968966038588e-07, "loss": 0.0782, "step": 9020 }, { "epoch": 1.0681334279630943, "grad_norm": 2.53070330619812, "learning_rate": 2.0295542869871398e-07, "loss": 0.0722, "step": 9030 }, { "epoch": 1.0693162999763426, "grad_norm": 3.0235517024993896, "learning_rate": 2.028311677370421e-07, "loss": 0.0684, "step": 9040 }, { "epoch": 1.0704991719895907, "grad_norm": 1.7872607707977295, "learning_rate": 2.0270690677537017e-07, "loss": 0.0793, "step": 9050 }, { "epoch": 1.071682044002839, "grad_norm": 2.859154224395752, "learning_rate": 2.0258264581369827e-07, "loss": 0.0761, "step": 9060 }, { "epoch": 1.072864916016087, "grad_norm": 4.579363822937012, "learning_rate": 2.0245838485202638e-07, "loss": 0.0798, "step": 9070 }, { "epoch": 1.0740477880293353, "grad_norm": 2.6780917644500732, "learning_rate": 2.0233412389035446e-07, "loss": 0.0753, "step": 9080 }, { "epoch": 1.0752306600425834, "grad_norm": 1.9808409214019775, "learning_rate": 2.022098629286826e-07, "loss": 0.0741, "step": 9090 }, { "epoch": 1.0764135320558315, "grad_norm": 2.2187116146087646, "learning_rate": 2.020856019670107e-07, "loss": 0.0695, "step": 9100 }, { "epoch": 1.0775964040690797, "grad_norm": 2.2176706790924072, "learning_rate": 2.0196134100533878e-07, "loss": 0.0724, "step": 9110 }, { "epoch": 1.0787792760823278, "grad_norm": 2.8089518547058105, "learning_rate": 2.0183708004366689e-07, "loss": 0.0786, "step": 9120 }, { "epoch": 1.0799621480955761, "grad_norm": 1.919397234916687, "learning_rate": 2.0171281908199497e-07, "loss": 0.0779, "step": 9130 }, { "epoch": 1.0811450201088242, "grad_norm": 2.3830599784851074, "learning_rate": 2.0158855812032307e-07, "loss": 0.0729, "step": 9140 }, { "epoch": 1.0823278921220725, "grad_norm": 1.6706242561340332, "learning_rate": 2.0146429715865118e-07, "loss": 0.0778, "step": 9150 }, { "epoch": 1.0835107641353205, "grad_norm": 2.791943073272705, "learning_rate": 2.0134003619697926e-07, "loss": 0.0783, "step": 9160 }, { "epoch": 1.0846936361485686, "grad_norm": 5.174086093902588, "learning_rate": 2.012157752353074e-07, "loss": 0.0783, "step": 9170 }, { "epoch": 1.085876508161817, "grad_norm": 3.0054752826690674, "learning_rate": 2.010915142736355e-07, "loss": 0.08, "step": 9180 }, { "epoch": 1.087059380175065, "grad_norm": 2.2934484481811523, "learning_rate": 2.0096725331196358e-07, "loss": 0.0797, "step": 9190 }, { "epoch": 1.0882422521883133, "grad_norm": 1.789128303527832, "learning_rate": 2.0084299235029168e-07, "loss": 0.08, "step": 9200 }, { "epoch": 1.0894251242015613, "grad_norm": 3.193634271621704, "learning_rate": 2.007187313886198e-07, "loss": 0.0748, "step": 9210 }, { "epoch": 1.0906079962148096, "grad_norm": 3.3913214206695557, "learning_rate": 2.0059447042694787e-07, "loss": 0.0742, "step": 9220 }, { "epoch": 1.0917908682280577, "grad_norm": 3.5068588256835938, "learning_rate": 2.0047020946527598e-07, "loss": 0.0665, "step": 9230 }, { "epoch": 1.0929737402413058, "grad_norm": 2.5233516693115234, "learning_rate": 2.0034594850360408e-07, "loss": 0.0814, "step": 9240 }, { "epoch": 1.094156612254554, "grad_norm": 2.2516121864318848, "learning_rate": 2.002216875419322e-07, "loss": 0.0869, "step": 9250 }, { "epoch": 1.0953394842678021, "grad_norm": 2.3393759727478027, "learning_rate": 2.000974265802603e-07, "loss": 0.0814, "step": 9260 }, { "epoch": 1.0965223562810504, "grad_norm": 4.100457668304443, "learning_rate": 1.999731656185884e-07, "loss": 0.0828, "step": 9270 }, { "epoch": 1.0977052282942985, "grad_norm": 3.160980701446533, "learning_rate": 1.9984890465691648e-07, "loss": 0.0726, "step": 9280 }, { "epoch": 1.0988881003075468, "grad_norm": 1.7075315713882446, "learning_rate": 1.997246436952446e-07, "loss": 0.0745, "step": 9290 }, { "epoch": 1.1000709723207949, "grad_norm": 2.9399731159210205, "learning_rate": 1.9960038273357267e-07, "loss": 0.0789, "step": 9300 }, { "epoch": 1.101253844334043, "grad_norm": 3.025312662124634, "learning_rate": 1.9947612177190078e-07, "loss": 0.0762, "step": 9310 }, { "epoch": 1.1024367163472912, "grad_norm": 2.2980003356933594, "learning_rate": 1.9935186081022888e-07, "loss": 0.0735, "step": 9320 }, { "epoch": 1.1036195883605393, "grad_norm": 2.6134819984436035, "learning_rate": 1.99227599848557e-07, "loss": 0.0696, "step": 9330 }, { "epoch": 1.1048024603737876, "grad_norm": 3.3714373111724854, "learning_rate": 1.991033388868851e-07, "loss": 0.0814, "step": 9340 }, { "epoch": 1.1059853323870357, "grad_norm": 2.004326581954956, "learning_rate": 1.989790779252132e-07, "loss": 0.0709, "step": 9350 }, { "epoch": 1.107168204400284, "grad_norm": 2.5760440826416016, "learning_rate": 1.9885481696354128e-07, "loss": 0.0733, "step": 9360 }, { "epoch": 1.108351076413532, "grad_norm": 3.0965304374694824, "learning_rate": 1.987305560018694e-07, "loss": 0.0728, "step": 9370 }, { "epoch": 1.1095339484267803, "grad_norm": 3.4987235069274902, "learning_rate": 1.986062950401975e-07, "loss": 0.075, "step": 9380 }, { "epoch": 1.1107168204400284, "grad_norm": 2.633436679840088, "learning_rate": 1.9848203407852557e-07, "loss": 0.0858, "step": 9390 }, { "epoch": 1.1118996924532765, "grad_norm": 2.269353151321411, "learning_rate": 1.9835777311685368e-07, "loss": 0.0777, "step": 9400 }, { "epoch": 1.1130825644665248, "grad_norm": 3.1884210109710693, "learning_rate": 1.9823351215518176e-07, "loss": 0.0875, "step": 9410 }, { "epoch": 1.1142654364797728, "grad_norm": 3.2863757610321045, "learning_rate": 1.981092511935099e-07, "loss": 0.0771, "step": 9420 }, { "epoch": 1.1154483084930211, "grad_norm": 2.7463855743408203, "learning_rate": 1.97984990231838e-07, "loss": 0.0716, "step": 9430 }, { "epoch": 1.1166311805062692, "grad_norm": 1.8967071771621704, "learning_rate": 1.9786072927016608e-07, "loss": 0.0744, "step": 9440 }, { "epoch": 1.1178140525195175, "grad_norm": 3.7236270904541016, "learning_rate": 1.9773646830849419e-07, "loss": 0.0778, "step": 9450 }, { "epoch": 1.1189969245327656, "grad_norm": 1.9732025861740112, "learning_rate": 1.976122073468223e-07, "loss": 0.0735, "step": 9460 }, { "epoch": 1.1201797965460136, "grad_norm": 2.5866172313690186, "learning_rate": 1.9748794638515037e-07, "loss": 0.0754, "step": 9470 }, { "epoch": 1.121362668559262, "grad_norm": 3.8165223598480225, "learning_rate": 1.9736368542347848e-07, "loss": 0.0828, "step": 9480 }, { "epoch": 1.12254554057251, "grad_norm": 3.269768714904785, "learning_rate": 1.9723942446180658e-07, "loss": 0.0739, "step": 9490 }, { "epoch": 1.1237284125857583, "grad_norm": 2.7737607955932617, "learning_rate": 1.971151635001347e-07, "loss": 0.0796, "step": 9500 }, { "epoch": 1.1249112845990064, "grad_norm": 2.406536340713501, "learning_rate": 1.969909025384628e-07, "loss": 0.073, "step": 9510 }, { "epoch": 1.1260941566122546, "grad_norm": 3.0000526905059814, "learning_rate": 1.968666415767909e-07, "loss": 0.071, "step": 9520 }, { "epoch": 1.1272770286255027, "grad_norm": 2.7837135791778564, "learning_rate": 1.9674238061511898e-07, "loss": 0.0765, "step": 9530 }, { "epoch": 1.1284599006387508, "grad_norm": 2.0944254398345947, "learning_rate": 1.966181196534471e-07, "loss": 0.0764, "step": 9540 }, { "epoch": 1.129642772651999, "grad_norm": 2.090421438217163, "learning_rate": 1.9649385869177517e-07, "loss": 0.0731, "step": 9550 }, { "epoch": 1.1308256446652472, "grad_norm": 2.753727912902832, "learning_rate": 1.9636959773010328e-07, "loss": 0.0712, "step": 9560 }, { "epoch": 1.1320085166784954, "grad_norm": 2.763155221939087, "learning_rate": 1.9624533676843138e-07, "loss": 0.0702, "step": 9570 }, { "epoch": 1.1331913886917435, "grad_norm": 2.517596960067749, "learning_rate": 1.9612107580675952e-07, "loss": 0.0769, "step": 9580 }, { "epoch": 1.1343742607049918, "grad_norm": 2.1710727214813232, "learning_rate": 1.959968148450876e-07, "loss": 0.0687, "step": 9590 }, { "epoch": 1.1355571327182399, "grad_norm": 3.195674419403076, "learning_rate": 1.958725538834157e-07, "loss": 0.079, "step": 9600 }, { "epoch": 1.1367400047314882, "grad_norm": 2.5745961666107178, "learning_rate": 1.9574829292174378e-07, "loss": 0.0741, "step": 9610 }, { "epoch": 1.1379228767447362, "grad_norm": 2.4509332180023193, "learning_rate": 1.956240319600719e-07, "loss": 0.0784, "step": 9620 }, { "epoch": 1.1391057487579843, "grad_norm": 3.579946279525757, "learning_rate": 1.954997709984e-07, "loss": 0.0777, "step": 9630 }, { "epoch": 1.1402886207712326, "grad_norm": 1.7113929986953735, "learning_rate": 1.9537551003672807e-07, "loss": 0.0807, "step": 9640 }, { "epoch": 1.1414714927844807, "grad_norm": 2.124246597290039, "learning_rate": 1.9525124907505618e-07, "loss": 0.0738, "step": 9650 }, { "epoch": 1.142654364797729, "grad_norm": 3.095815658569336, "learning_rate": 1.9512698811338431e-07, "loss": 0.0837, "step": 9660 }, { "epoch": 1.143837236810977, "grad_norm": 2.5926098823547363, "learning_rate": 1.950027271517124e-07, "loss": 0.0785, "step": 9670 }, { "epoch": 1.1450201088242253, "grad_norm": 2.347762107849121, "learning_rate": 1.948784661900405e-07, "loss": 0.0738, "step": 9680 }, { "epoch": 1.1462029808374734, "grad_norm": 3.398646831512451, "learning_rate": 1.947542052283686e-07, "loss": 0.0738, "step": 9690 }, { "epoch": 1.1473858528507215, "grad_norm": 2.3503127098083496, "learning_rate": 1.9462994426669669e-07, "loss": 0.0796, "step": 9700 }, { "epoch": 1.1485687248639698, "grad_norm": 2.8921401500701904, "learning_rate": 1.945056833050248e-07, "loss": 0.0792, "step": 9710 }, { "epoch": 1.1497515968772178, "grad_norm": 2.749664783477783, "learning_rate": 1.9438142234335287e-07, "loss": 0.0676, "step": 9720 }, { "epoch": 1.1509344688904661, "grad_norm": 2.9630658626556396, "learning_rate": 1.9425716138168098e-07, "loss": 0.0751, "step": 9730 }, { "epoch": 1.1521173409037142, "grad_norm": 2.5618436336517334, "learning_rate": 1.941329004200091e-07, "loss": 0.0706, "step": 9740 }, { "epoch": 1.1533002129169625, "grad_norm": 2.9241514205932617, "learning_rate": 1.940086394583372e-07, "loss": 0.0798, "step": 9750 }, { "epoch": 1.1544830849302106, "grad_norm": 1.9625474214553833, "learning_rate": 1.938843784966653e-07, "loss": 0.0687, "step": 9760 }, { "epoch": 1.1556659569434586, "grad_norm": 2.455737352371216, "learning_rate": 1.937601175349934e-07, "loss": 0.0748, "step": 9770 }, { "epoch": 1.156848828956707, "grad_norm": 3.1385653018951416, "learning_rate": 1.9363585657332148e-07, "loss": 0.0695, "step": 9780 }, { "epoch": 1.158031700969955, "grad_norm": 2.253441333770752, "learning_rate": 1.935115956116496e-07, "loss": 0.0745, "step": 9790 }, { "epoch": 1.1592145729832033, "grad_norm": 2.0933141708374023, "learning_rate": 1.933873346499777e-07, "loss": 0.0791, "step": 9800 }, { "epoch": 1.1603974449964514, "grad_norm": 3.1684372425079346, "learning_rate": 1.9326307368830578e-07, "loss": 0.074, "step": 9810 }, { "epoch": 1.1615803170096997, "grad_norm": 3.5070841312408447, "learning_rate": 1.9313881272663388e-07, "loss": 0.0769, "step": 9820 }, { "epoch": 1.1627631890229477, "grad_norm": 3.185100793838501, "learning_rate": 1.9301455176496202e-07, "loss": 0.0742, "step": 9830 }, { "epoch": 1.1639460610361958, "grad_norm": 1.8848037719726562, "learning_rate": 1.928902908032901e-07, "loss": 0.0707, "step": 9840 }, { "epoch": 1.165128933049444, "grad_norm": 2.7763640880584717, "learning_rate": 1.927660298416182e-07, "loss": 0.0792, "step": 9850 }, { "epoch": 1.1663118050626922, "grad_norm": 1.7983925342559814, "learning_rate": 1.9264176887994628e-07, "loss": 0.0743, "step": 9860 }, { "epoch": 1.1674946770759405, "grad_norm": 1.697885513305664, "learning_rate": 1.925175079182744e-07, "loss": 0.0744, "step": 9870 }, { "epoch": 1.1686775490891885, "grad_norm": 2.2275242805480957, "learning_rate": 1.923932469566025e-07, "loss": 0.0753, "step": 9880 }, { "epoch": 1.1698604211024368, "grad_norm": 1.8271421194076538, "learning_rate": 1.9226898599493058e-07, "loss": 0.0724, "step": 9890 }, { "epoch": 1.171043293115685, "grad_norm": 2.5891783237457275, "learning_rate": 1.9214472503325868e-07, "loss": 0.0711, "step": 9900 }, { "epoch": 1.172226165128933, "grad_norm": 2.3743555545806885, "learning_rate": 1.9202046407158681e-07, "loss": 0.0757, "step": 9910 }, { "epoch": 1.1734090371421813, "grad_norm": 2.593397378921509, "learning_rate": 1.918962031099149e-07, "loss": 0.074, "step": 9920 }, { "epoch": 1.1745919091554293, "grad_norm": 3.7376866340637207, "learning_rate": 1.91771942148243e-07, "loss": 0.0771, "step": 9930 }, { "epoch": 1.1757747811686776, "grad_norm": 3.2411859035491943, "learning_rate": 1.916476811865711e-07, "loss": 0.0694, "step": 9940 }, { "epoch": 1.1769576531819257, "grad_norm": 2.376613140106201, "learning_rate": 1.915234202248992e-07, "loss": 0.084, "step": 9950 }, { "epoch": 1.178140525195174, "grad_norm": 2.492509603500366, "learning_rate": 1.913991592632273e-07, "loss": 0.0716, "step": 9960 }, { "epoch": 1.179323397208422, "grad_norm": 3.933540105819702, "learning_rate": 1.9127489830155537e-07, "loss": 0.0771, "step": 9970 }, { "epoch": 1.1805062692216701, "grad_norm": 3.509897232055664, "learning_rate": 1.9115063733988348e-07, "loss": 0.0823, "step": 9980 }, { "epoch": 1.1816891412349184, "grad_norm": 2.291003942489624, "learning_rate": 1.910263763782116e-07, "loss": 0.0818, "step": 9990 }, { "epoch": 1.1828720132481665, "grad_norm": 2.4255807399749756, "learning_rate": 1.9090211541653972e-07, "loss": 0.0766, "step": 10000 }, { "epoch": 1.1840548852614148, "grad_norm": 2.3629634380340576, "learning_rate": 1.907778544548678e-07, "loss": 0.0676, "step": 10010 }, { "epoch": 1.1852377572746629, "grad_norm": 2.9983327388763428, "learning_rate": 1.906535934931959e-07, "loss": 0.0722, "step": 10020 }, { "epoch": 1.1864206292879111, "grad_norm": 2.647615671157837, "learning_rate": 1.9052933253152399e-07, "loss": 0.0609, "step": 10030 }, { "epoch": 1.1876035013011592, "grad_norm": 2.060760498046875, "learning_rate": 1.904050715698521e-07, "loss": 0.0782, "step": 10040 }, { "epoch": 1.1887863733144073, "grad_norm": 2.87583589553833, "learning_rate": 1.902808106081802e-07, "loss": 0.0807, "step": 10050 }, { "epoch": 1.1899692453276556, "grad_norm": 2.5189194679260254, "learning_rate": 1.9015654964650828e-07, "loss": 0.0755, "step": 10060 }, { "epoch": 1.1911521173409036, "grad_norm": 2.739980697631836, "learning_rate": 1.900322886848364e-07, "loss": 0.074, "step": 10070 }, { "epoch": 1.192334989354152, "grad_norm": 2.1514699459075928, "learning_rate": 1.8990802772316452e-07, "loss": 0.0691, "step": 10080 }, { "epoch": 1.1935178613674, "grad_norm": 4.030481815338135, "learning_rate": 1.897837667614926e-07, "loss": 0.0767, "step": 10090 }, { "epoch": 1.1947007333806483, "grad_norm": 1.9449803829193115, "learning_rate": 1.896595057998207e-07, "loss": 0.0799, "step": 10100 }, { "epoch": 1.1958836053938964, "grad_norm": 2.5487143993377686, "learning_rate": 1.895352448381488e-07, "loss": 0.0791, "step": 10110 }, { "epoch": 1.1970664774071444, "grad_norm": 3.0360636711120605, "learning_rate": 1.894109838764769e-07, "loss": 0.0755, "step": 10120 }, { "epoch": 1.1982493494203927, "grad_norm": 2.5314388275146484, "learning_rate": 1.89286722914805e-07, "loss": 0.081, "step": 10130 }, { "epoch": 1.1994322214336408, "grad_norm": 2.2630794048309326, "learning_rate": 1.8916246195313308e-07, "loss": 0.0714, "step": 10140 }, { "epoch": 1.200615093446889, "grad_norm": 2.800778865814209, "learning_rate": 1.890382009914612e-07, "loss": 0.0667, "step": 10150 }, { "epoch": 1.2017979654601372, "grad_norm": 3.027975082397461, "learning_rate": 1.8891394002978932e-07, "loss": 0.0706, "step": 10160 }, { "epoch": 1.2029808374733855, "grad_norm": 2.988739252090454, "learning_rate": 1.887896790681174e-07, "loss": 0.0746, "step": 10170 }, { "epoch": 1.2041637094866335, "grad_norm": 3.0994184017181396, "learning_rate": 1.886654181064455e-07, "loss": 0.0732, "step": 10180 }, { "epoch": 1.2053465814998816, "grad_norm": 2.8147215843200684, "learning_rate": 1.885411571447736e-07, "loss": 0.0753, "step": 10190 }, { "epoch": 1.20652945351313, "grad_norm": 2.986738681793213, "learning_rate": 1.884168961831017e-07, "loss": 0.0752, "step": 10200 }, { "epoch": 1.207712325526378, "grad_norm": 2.363673448562622, "learning_rate": 1.882926352214298e-07, "loss": 0.0737, "step": 10210 }, { "epoch": 1.2088951975396263, "grad_norm": 3.1147356033325195, "learning_rate": 1.881683742597579e-07, "loss": 0.0736, "step": 10220 }, { "epoch": 1.2100780695528743, "grad_norm": 2.4080238342285156, "learning_rate": 1.8804411329808598e-07, "loss": 0.0659, "step": 10230 }, { "epoch": 1.2112609415661226, "grad_norm": 2.189600944519043, "learning_rate": 1.8791985233641411e-07, "loss": 0.0683, "step": 10240 }, { "epoch": 1.2124438135793707, "grad_norm": 2.497997999191284, "learning_rate": 1.8779559137474222e-07, "loss": 0.0843, "step": 10250 }, { "epoch": 1.2136266855926188, "grad_norm": 3.0080974102020264, "learning_rate": 1.876713304130703e-07, "loss": 0.0713, "step": 10260 }, { "epoch": 1.214809557605867, "grad_norm": 2.837324380874634, "learning_rate": 1.875470694513984e-07, "loss": 0.064, "step": 10270 }, { "epoch": 1.2159924296191151, "grad_norm": 2.4607927799224854, "learning_rate": 1.8742280848972649e-07, "loss": 0.0789, "step": 10280 }, { "epoch": 1.2171753016323634, "grad_norm": 2.226661443710327, "learning_rate": 1.872985475280546e-07, "loss": 0.082, "step": 10290 }, { "epoch": 1.2183581736456115, "grad_norm": 2.6015899181365967, "learning_rate": 1.871742865663827e-07, "loss": 0.0846, "step": 10300 }, { "epoch": 1.2195410456588598, "grad_norm": 3.5614967346191406, "learning_rate": 1.8705002560471078e-07, "loss": 0.0841, "step": 10310 }, { "epoch": 1.2207239176721079, "grad_norm": 2.327575206756592, "learning_rate": 1.869257646430389e-07, "loss": 0.0753, "step": 10320 }, { "epoch": 1.221906789685356, "grad_norm": 2.732860803604126, "learning_rate": 1.8680150368136702e-07, "loss": 0.0756, "step": 10330 }, { "epoch": 1.2230896616986042, "grad_norm": 2.178374767303467, "learning_rate": 1.866772427196951e-07, "loss": 0.0756, "step": 10340 }, { "epoch": 1.2242725337118523, "grad_norm": 2.6574437618255615, "learning_rate": 1.865529817580232e-07, "loss": 0.0868, "step": 10350 }, { "epoch": 1.2254554057251006, "grad_norm": 2.589495897293091, "learning_rate": 1.864287207963513e-07, "loss": 0.0745, "step": 10360 }, { "epoch": 1.2266382777383487, "grad_norm": 2.842524290084839, "learning_rate": 1.863044598346794e-07, "loss": 0.0741, "step": 10370 }, { "epoch": 1.227821149751597, "grad_norm": 2.806434154510498, "learning_rate": 1.861801988730075e-07, "loss": 0.0751, "step": 10380 }, { "epoch": 1.229004021764845, "grad_norm": 2.965468406677246, "learning_rate": 1.8605593791133558e-07, "loss": 0.0798, "step": 10390 }, { "epoch": 1.230186893778093, "grad_norm": 3.477609634399414, "learning_rate": 1.859316769496637e-07, "loss": 0.0823, "step": 10400 }, { "epoch": 1.2313697657913414, "grad_norm": 2.2664198875427246, "learning_rate": 1.8580741598799182e-07, "loss": 0.0788, "step": 10410 }, { "epoch": 1.2325526378045895, "grad_norm": 2.283999443054199, "learning_rate": 1.8568315502631992e-07, "loss": 0.0741, "step": 10420 }, { "epoch": 1.2337355098178377, "grad_norm": 1.9421151876449585, "learning_rate": 1.85558894064648e-07, "loss": 0.0661, "step": 10430 }, { "epoch": 1.2349183818310858, "grad_norm": 1.8145650625228882, "learning_rate": 1.854346331029761e-07, "loss": 0.0747, "step": 10440 }, { "epoch": 1.2361012538443341, "grad_norm": 2.775660991668701, "learning_rate": 1.853103721413042e-07, "loss": 0.0709, "step": 10450 }, { "epoch": 1.2372841258575822, "grad_norm": 2.503087043762207, "learning_rate": 1.851861111796323e-07, "loss": 0.085, "step": 10460 }, { "epoch": 1.2384669978708303, "grad_norm": 2.566084146499634, "learning_rate": 1.850618502179604e-07, "loss": 0.0767, "step": 10470 }, { "epoch": 1.2396498698840785, "grad_norm": 1.9818662405014038, "learning_rate": 1.849375892562885e-07, "loss": 0.08, "step": 10480 }, { "epoch": 1.2408327418973266, "grad_norm": 2.779775619506836, "learning_rate": 1.8481332829461661e-07, "loss": 0.0761, "step": 10490 }, { "epoch": 1.242015613910575, "grad_norm": 2.4932188987731934, "learning_rate": 1.8468906733294472e-07, "loss": 0.0753, "step": 10500 }, { "epoch": 1.243198485923823, "grad_norm": 2.9882545471191406, "learning_rate": 1.845648063712728e-07, "loss": 0.0843, "step": 10510 }, { "epoch": 1.2443813579370713, "grad_norm": 2.1128604412078857, "learning_rate": 1.844405454096009e-07, "loss": 0.0756, "step": 10520 }, { "epoch": 1.2455642299503193, "grad_norm": 2.9303808212280273, "learning_rate": 1.8431628444792901e-07, "loss": 0.082, "step": 10530 }, { "epoch": 1.2467471019635676, "grad_norm": 2.034457206726074, "learning_rate": 1.841920234862571e-07, "loss": 0.071, "step": 10540 }, { "epoch": 1.2479299739768157, "grad_norm": 3.8156981468200684, "learning_rate": 1.840677625245852e-07, "loss": 0.0831, "step": 10550 }, { "epoch": 1.2491128459900638, "grad_norm": 2.5859110355377197, "learning_rate": 1.8394350156291333e-07, "loss": 0.0767, "step": 10560 }, { "epoch": 1.250295718003312, "grad_norm": 3.9396347999572754, "learning_rate": 1.838192406012414e-07, "loss": 0.0801, "step": 10570 }, { "epoch": 1.2514785900165601, "grad_norm": 2.3333253860473633, "learning_rate": 1.8369497963956952e-07, "loss": 0.0712, "step": 10580 }, { "epoch": 1.2526614620298084, "grad_norm": 2.51991605758667, "learning_rate": 1.835707186778976e-07, "loss": 0.0807, "step": 10590 }, { "epoch": 1.2538443340430565, "grad_norm": 2.647477626800537, "learning_rate": 1.834464577162257e-07, "loss": 0.0758, "step": 10600 }, { "epoch": 1.2550272060563046, "grad_norm": 2.24906325340271, "learning_rate": 1.833221967545538e-07, "loss": 0.0721, "step": 10610 }, { "epoch": 1.2562100780695529, "grad_norm": 2.1972873210906982, "learning_rate": 1.831979357928819e-07, "loss": 0.0742, "step": 10620 }, { "epoch": 1.2573929500828012, "grad_norm": 2.4790585041046143, "learning_rate": 1.8307367483121e-07, "loss": 0.0672, "step": 10630 }, { "epoch": 1.2585758220960492, "grad_norm": 2.2904605865478516, "learning_rate": 1.829494138695381e-07, "loss": 0.0723, "step": 10640 }, { "epoch": 1.2597586941092973, "grad_norm": 2.525190591812134, "learning_rate": 1.828251529078662e-07, "loss": 0.0765, "step": 10650 }, { "epoch": 1.2609415661225456, "grad_norm": 2.5790398120880127, "learning_rate": 1.8270089194619432e-07, "loss": 0.074, "step": 10660 }, { "epoch": 1.2621244381357937, "grad_norm": 2.5196616649627686, "learning_rate": 1.8257663098452242e-07, "loss": 0.0753, "step": 10670 }, { "epoch": 1.2633073101490417, "grad_norm": 2.0713388919830322, "learning_rate": 1.824523700228505e-07, "loss": 0.0696, "step": 10680 }, { "epoch": 1.26449018216229, "grad_norm": 2.5578272342681885, "learning_rate": 1.823281090611786e-07, "loss": 0.0771, "step": 10690 }, { "epoch": 1.2656730541755383, "grad_norm": 2.3968288898468018, "learning_rate": 1.822038480995067e-07, "loss": 0.0754, "step": 10700 }, { "epoch": 1.2668559261887864, "grad_norm": 3.188706636428833, "learning_rate": 1.820795871378348e-07, "loss": 0.0709, "step": 10710 }, { "epoch": 1.2680387982020345, "grad_norm": 2.7589690685272217, "learning_rate": 1.819553261761629e-07, "loss": 0.0754, "step": 10720 }, { "epoch": 1.2692216702152828, "grad_norm": 3.02348256111145, "learning_rate": 1.8183106521449104e-07, "loss": 0.0679, "step": 10730 }, { "epoch": 1.2704045422285308, "grad_norm": 3.7256217002868652, "learning_rate": 1.8170680425281912e-07, "loss": 0.0768, "step": 10740 }, { "epoch": 1.271587414241779, "grad_norm": 2.766768217086792, "learning_rate": 1.8158254329114722e-07, "loss": 0.075, "step": 10750 }, { "epoch": 1.2727702862550272, "grad_norm": 2.745911121368408, "learning_rate": 1.814582823294753e-07, "loss": 0.0709, "step": 10760 }, { "epoch": 1.2739531582682755, "grad_norm": 3.2030539512634277, "learning_rate": 1.813340213678034e-07, "loss": 0.075, "step": 10770 }, { "epoch": 1.2751360302815236, "grad_norm": 3.1271183490753174, "learning_rate": 1.8120976040613152e-07, "loss": 0.0726, "step": 10780 }, { "epoch": 1.2763189022947716, "grad_norm": 3.2420425415039062, "learning_rate": 1.810854994444596e-07, "loss": 0.0873, "step": 10790 }, { "epoch": 1.27750177430802, "grad_norm": 3.6839377880096436, "learning_rate": 1.809612384827877e-07, "loss": 0.0816, "step": 10800 }, { "epoch": 1.278684646321268, "grad_norm": 2.9440393447875977, "learning_rate": 1.8083697752111583e-07, "loss": 0.0751, "step": 10810 }, { "epoch": 1.2798675183345163, "grad_norm": 2.656099319458008, "learning_rate": 1.8071271655944391e-07, "loss": 0.0715, "step": 10820 }, { "epoch": 1.2810503903477644, "grad_norm": 2.2575294971466064, "learning_rate": 1.8058845559777202e-07, "loss": 0.0616, "step": 10830 }, { "epoch": 1.2822332623610126, "grad_norm": 2.6596298217773438, "learning_rate": 1.8046419463610013e-07, "loss": 0.0738, "step": 10840 }, { "epoch": 1.2834161343742607, "grad_norm": 3.965053081512451, "learning_rate": 1.803399336744282e-07, "loss": 0.0841, "step": 10850 }, { "epoch": 1.2845990063875088, "grad_norm": 3.104052782058716, "learning_rate": 1.8021567271275631e-07, "loss": 0.0765, "step": 10860 }, { "epoch": 1.285781878400757, "grad_norm": 3.060460329055786, "learning_rate": 1.800914117510844e-07, "loss": 0.0704, "step": 10870 }, { "epoch": 1.2869647504140052, "grad_norm": 2.4671404361724854, "learning_rate": 1.799671507894125e-07, "loss": 0.0652, "step": 10880 }, { "epoch": 1.2881476224272534, "grad_norm": 2.072101593017578, "learning_rate": 1.7984288982774063e-07, "loss": 0.0737, "step": 10890 }, { "epoch": 1.2893304944405015, "grad_norm": 2.980739116668701, "learning_rate": 1.797186288660687e-07, "loss": 0.0767, "step": 10900 }, { "epoch": 1.2905133664537498, "grad_norm": 2.754775047302246, "learning_rate": 1.7959436790439682e-07, "loss": 0.0754, "step": 10910 }, { "epoch": 1.2916962384669979, "grad_norm": 3.1310577392578125, "learning_rate": 1.7947010694272493e-07, "loss": 0.0688, "step": 10920 }, { "epoch": 1.292879110480246, "grad_norm": 2.536647319793701, "learning_rate": 1.79345845981053e-07, "loss": 0.0641, "step": 10930 }, { "epoch": 1.2940619824934942, "grad_norm": 2.9957022666931152, "learning_rate": 1.792215850193811e-07, "loss": 0.0784, "step": 10940 }, { "epoch": 1.2952448545067423, "grad_norm": 2.432373046875, "learning_rate": 1.7909732405770922e-07, "loss": 0.0791, "step": 10950 }, { "epoch": 1.2964277265199906, "grad_norm": 2.5608551502227783, "learning_rate": 1.789730630960373e-07, "loss": 0.0743, "step": 10960 }, { "epoch": 1.2976105985332387, "grad_norm": 2.696970224380493, "learning_rate": 1.788488021343654e-07, "loss": 0.0706, "step": 10970 }, { "epoch": 1.298793470546487, "grad_norm": 2.8506977558135986, "learning_rate": 1.7872454117269354e-07, "loss": 0.0743, "step": 10980 }, { "epoch": 1.299976342559735, "grad_norm": 2.6301724910736084, "learning_rate": 1.7860028021102162e-07, "loss": 0.0685, "step": 10990 }, { "epoch": 1.3011592145729831, "grad_norm": 2.2941877841949463, "learning_rate": 1.7847601924934972e-07, "loss": 0.0758, "step": 11000 }, { "epoch": 1.3023420865862314, "grad_norm": 2.669881582260132, "learning_rate": 1.783517582876778e-07, "loss": 0.0732, "step": 11010 }, { "epoch": 1.3035249585994795, "grad_norm": 2.9773051738739014, "learning_rate": 1.782274973260059e-07, "loss": 0.0806, "step": 11020 }, { "epoch": 1.3047078306127278, "grad_norm": 2.851123094558716, "learning_rate": 1.7810323636433402e-07, "loss": 0.0749, "step": 11030 }, { "epoch": 1.3058907026259758, "grad_norm": 2.5020132064819336, "learning_rate": 1.779789754026621e-07, "loss": 0.0777, "step": 11040 }, { "epoch": 1.3070735746392241, "grad_norm": 2.6860289573669434, "learning_rate": 1.778547144409902e-07, "loss": 0.0842, "step": 11050 }, { "epoch": 1.3082564466524722, "grad_norm": 3.1209604740142822, "learning_rate": 1.7773045347931834e-07, "loss": 0.081, "step": 11060 }, { "epoch": 1.3094393186657203, "grad_norm": 2.6659886837005615, "learning_rate": 1.7760619251764642e-07, "loss": 0.0773, "step": 11070 }, { "epoch": 1.3106221906789686, "grad_norm": 2.1433403491973877, "learning_rate": 1.7748193155597452e-07, "loss": 0.0735, "step": 11080 }, { "epoch": 1.3118050626922166, "grad_norm": 2.608316421508789, "learning_rate": 1.7735767059430263e-07, "loss": 0.0798, "step": 11090 }, { "epoch": 1.312987934705465, "grad_norm": 3.2659294605255127, "learning_rate": 1.772334096326307e-07, "loss": 0.0764, "step": 11100 }, { "epoch": 1.314170806718713, "grad_norm": 3.136723041534424, "learning_rate": 1.7710914867095881e-07, "loss": 0.0699, "step": 11110 }, { "epoch": 1.3153536787319613, "grad_norm": 2.6990578174591064, "learning_rate": 1.769848877092869e-07, "loss": 0.0754, "step": 11120 }, { "epoch": 1.3165365507452094, "grad_norm": 2.442011594772339, "learning_rate": 1.76860626747615e-07, "loss": 0.0767, "step": 11130 }, { "epoch": 1.3177194227584574, "grad_norm": 2.2631821632385254, "learning_rate": 1.7673636578594313e-07, "loss": 0.0741, "step": 11140 }, { "epoch": 1.3189022947717057, "grad_norm": 1.9379740953445435, "learning_rate": 1.7661210482427124e-07, "loss": 0.0743, "step": 11150 }, { "epoch": 1.3200851667849538, "grad_norm": 2.7883801460266113, "learning_rate": 1.7648784386259932e-07, "loss": 0.0793, "step": 11160 }, { "epoch": 1.321268038798202, "grad_norm": 2.3383901119232178, "learning_rate": 1.7636358290092743e-07, "loss": 0.0757, "step": 11170 }, { "epoch": 1.3224509108114502, "grad_norm": 2.562122344970703, "learning_rate": 1.762393219392555e-07, "loss": 0.0798, "step": 11180 }, { "epoch": 1.3236337828246985, "grad_norm": 1.9497108459472656, "learning_rate": 1.761150609775836e-07, "loss": 0.0713, "step": 11190 }, { "epoch": 1.3248166548379465, "grad_norm": 2.7113165855407715, "learning_rate": 1.7599080001591172e-07, "loss": 0.0772, "step": 11200 }, { "epoch": 1.3259995268511946, "grad_norm": 2.4533438682556152, "learning_rate": 1.758665390542398e-07, "loss": 0.072, "step": 11210 }, { "epoch": 1.327182398864443, "grad_norm": 2.6708974838256836, "learning_rate": 1.7574227809256793e-07, "loss": 0.0813, "step": 11220 }, { "epoch": 1.328365270877691, "grad_norm": 2.907667636871338, "learning_rate": 1.7561801713089604e-07, "loss": 0.0719, "step": 11230 }, { "epoch": 1.3295481428909393, "grad_norm": 2.6713714599609375, "learning_rate": 1.7549375616922412e-07, "loss": 0.0757, "step": 11240 }, { "epoch": 1.3307310149041873, "grad_norm": 3.1299057006835938, "learning_rate": 1.7536949520755222e-07, "loss": 0.0788, "step": 11250 }, { "epoch": 1.3319138869174356, "grad_norm": 2.2319180965423584, "learning_rate": 1.7524523424588033e-07, "loss": 0.0733, "step": 11260 }, { "epoch": 1.3330967589306837, "grad_norm": 2.356633424758911, "learning_rate": 1.751209732842084e-07, "loss": 0.0745, "step": 11270 }, { "epoch": 1.3342796309439318, "grad_norm": 3.3213579654693604, "learning_rate": 1.7499671232253652e-07, "loss": 0.0823, "step": 11280 }, { "epoch": 1.33546250295718, "grad_norm": 3.7159550189971924, "learning_rate": 1.748724513608646e-07, "loss": 0.0789, "step": 11290 }, { "epoch": 1.3366453749704281, "grad_norm": 2.3413267135620117, "learning_rate": 1.7474819039919273e-07, "loss": 0.0747, "step": 11300 }, { "epoch": 1.3378282469836764, "grad_norm": 2.1209421157836914, "learning_rate": 1.7462392943752084e-07, "loss": 0.0754, "step": 11310 }, { "epoch": 1.3390111189969245, "grad_norm": 3.0857598781585693, "learning_rate": 1.7449966847584892e-07, "loss": 0.0825, "step": 11320 }, { "epoch": 1.3401939910101728, "grad_norm": 2.7139978408813477, "learning_rate": 1.7437540751417702e-07, "loss": 0.0729, "step": 11330 }, { "epoch": 1.3413768630234209, "grad_norm": 2.266934394836426, "learning_rate": 1.7425114655250513e-07, "loss": 0.0774, "step": 11340 }, { "epoch": 1.342559735036669, "grad_norm": 3.2305619716644287, "learning_rate": 1.741268855908332e-07, "loss": 0.0754, "step": 11350 }, { "epoch": 1.3437426070499172, "grad_norm": 2.8088035583496094, "learning_rate": 1.7400262462916132e-07, "loss": 0.0821, "step": 11360 }, { "epoch": 1.3449254790631653, "grad_norm": 3.3630781173706055, "learning_rate": 1.7387836366748942e-07, "loss": 0.0864, "step": 11370 }, { "epoch": 1.3461083510764136, "grad_norm": 4.448792457580566, "learning_rate": 1.737541027058175e-07, "loss": 0.0723, "step": 11380 }, { "epoch": 1.3472912230896616, "grad_norm": 2.7885677814483643, "learning_rate": 1.7362984174414563e-07, "loss": 0.0728, "step": 11390 }, { "epoch": 1.34847409510291, "grad_norm": 2.816509485244751, "learning_rate": 1.7350558078247374e-07, "loss": 0.0846, "step": 11400 }, { "epoch": 1.349656967116158, "grad_norm": 2.693006992340088, "learning_rate": 1.7338131982080182e-07, "loss": 0.0659, "step": 11410 }, { "epoch": 1.350839839129406, "grad_norm": 2.747535228729248, "learning_rate": 1.7325705885912993e-07, "loss": 0.0795, "step": 11420 }, { "epoch": 1.3520227111426544, "grad_norm": 2.93385910987854, "learning_rate": 1.73132797897458e-07, "loss": 0.0696, "step": 11430 }, { "epoch": 1.3532055831559027, "grad_norm": 1.5259615182876587, "learning_rate": 1.7300853693578611e-07, "loss": 0.0716, "step": 11440 }, { "epoch": 1.3543884551691507, "grad_norm": 3.025064468383789, "learning_rate": 1.7288427597411422e-07, "loss": 0.0865, "step": 11450 }, { "epoch": 1.3555713271823988, "grad_norm": 2.5986828804016113, "learning_rate": 1.727600150124423e-07, "loss": 0.0704, "step": 11460 }, { "epoch": 1.356754199195647, "grad_norm": 2.9241442680358887, "learning_rate": 1.7263575405077043e-07, "loss": 0.0721, "step": 11470 }, { "epoch": 1.3579370712088952, "grad_norm": 3.558199644088745, "learning_rate": 1.7251149308909854e-07, "loss": 0.0758, "step": 11480 }, { "epoch": 1.3591199432221432, "grad_norm": 2.718388557434082, "learning_rate": 1.7238723212742662e-07, "loss": 0.083, "step": 11490 }, { "epoch": 1.3603028152353915, "grad_norm": 2.279693365097046, "learning_rate": 1.7226297116575473e-07, "loss": 0.0821, "step": 11500 }, { "epoch": 1.3614856872486398, "grad_norm": 2.612525701522827, "learning_rate": 1.7213871020408283e-07, "loss": 0.082, "step": 11510 }, { "epoch": 1.362668559261888, "grad_norm": 2.72381854057312, "learning_rate": 1.720144492424109e-07, "loss": 0.0747, "step": 11520 }, { "epoch": 1.363851431275136, "grad_norm": 3.7314040660858154, "learning_rate": 1.7189018828073902e-07, "loss": 0.0704, "step": 11530 }, { "epoch": 1.3650343032883843, "grad_norm": 2.396562337875366, "learning_rate": 1.717659273190671e-07, "loss": 0.0715, "step": 11540 }, { "epoch": 1.3662171753016323, "grad_norm": 2.735402822494507, "learning_rate": 1.7164166635739523e-07, "loss": 0.0793, "step": 11550 }, { "epoch": 1.3674000473148804, "grad_norm": 2.487879514694214, "learning_rate": 1.7151740539572334e-07, "loss": 0.0777, "step": 11560 }, { "epoch": 1.3685829193281287, "grad_norm": 2.569244384765625, "learning_rate": 1.7139314443405144e-07, "loss": 0.0786, "step": 11570 }, { "epoch": 1.369765791341377, "grad_norm": 2.367499351501465, "learning_rate": 1.7126888347237952e-07, "loss": 0.0805, "step": 11580 }, { "epoch": 1.370948663354625, "grad_norm": 2.4436962604522705, "learning_rate": 1.7114462251070763e-07, "loss": 0.0748, "step": 11590 }, { "epoch": 1.3721315353678731, "grad_norm": 2.7935268878936768, "learning_rate": 1.710203615490357e-07, "loss": 0.0757, "step": 11600 }, { "epoch": 1.3733144073811214, "grad_norm": 2.638911247253418, "learning_rate": 1.7089610058736382e-07, "loss": 0.0776, "step": 11610 }, { "epoch": 1.3744972793943695, "grad_norm": 2.391378164291382, "learning_rate": 1.7077183962569192e-07, "loss": 0.0777, "step": 11620 }, { "epoch": 1.3756801514076176, "grad_norm": 3.1451971530914307, "learning_rate": 1.7064757866402003e-07, "loss": 0.0707, "step": 11630 }, { "epoch": 1.3768630234208659, "grad_norm": 2.077181816101074, "learning_rate": 1.7052331770234814e-07, "loss": 0.0767, "step": 11640 }, { "epoch": 1.3780458954341142, "grad_norm": 2.781158208847046, "learning_rate": 1.7039905674067624e-07, "loss": 0.077, "step": 11650 }, { "epoch": 1.3792287674473622, "grad_norm": 3.037055730819702, "learning_rate": 1.7027479577900432e-07, "loss": 0.0779, "step": 11660 }, { "epoch": 1.3804116394606103, "grad_norm": 3.7183821201324463, "learning_rate": 1.7015053481733243e-07, "loss": 0.0864, "step": 11670 }, { "epoch": 1.3815945114738586, "grad_norm": 2.7346978187561035, "learning_rate": 1.7002627385566053e-07, "loss": 0.0844, "step": 11680 }, { "epoch": 1.3827773834871067, "grad_norm": 2.430260181427002, "learning_rate": 1.6990201289398861e-07, "loss": 0.0783, "step": 11690 }, { "epoch": 1.3839602555003547, "grad_norm": 2.909252643585205, "learning_rate": 1.6977775193231672e-07, "loss": 0.0908, "step": 11700 }, { "epoch": 1.385143127513603, "grad_norm": 2.7536327838897705, "learning_rate": 1.6965349097064485e-07, "loss": 0.0722, "step": 11710 }, { "epoch": 1.3863259995268513, "grad_norm": 2.7391438484191895, "learning_rate": 1.6952923000897293e-07, "loss": 0.0827, "step": 11720 }, { "epoch": 1.3875088715400994, "grad_norm": 3.1197941303253174, "learning_rate": 1.6940496904730104e-07, "loss": 0.0791, "step": 11730 }, { "epoch": 1.3886917435533475, "grad_norm": 2.187282085418701, "learning_rate": 1.6928070808562912e-07, "loss": 0.0732, "step": 11740 }, { "epoch": 1.3898746155665958, "grad_norm": 2.2743356227874756, "learning_rate": 1.6915644712395723e-07, "loss": 0.0685, "step": 11750 }, { "epoch": 1.3910574875798438, "grad_norm": 2.7519750595092773, "learning_rate": 1.6903218616228533e-07, "loss": 0.0699, "step": 11760 }, { "epoch": 1.392240359593092, "grad_norm": 2.1377408504486084, "learning_rate": 1.689079252006134e-07, "loss": 0.0778, "step": 11770 }, { "epoch": 1.3934232316063402, "grad_norm": 2.5670149326324463, "learning_rate": 1.6878366423894152e-07, "loss": 0.0749, "step": 11780 }, { "epoch": 1.3946061036195885, "grad_norm": 1.9924920797348022, "learning_rate": 1.6865940327726963e-07, "loss": 0.0756, "step": 11790 }, { "epoch": 1.3957889756328365, "grad_norm": 2.7001779079437256, "learning_rate": 1.6853514231559773e-07, "loss": 0.0756, "step": 11800 }, { "epoch": 1.3969718476460846, "grad_norm": 2.2047815322875977, "learning_rate": 1.6841088135392584e-07, "loss": 0.0796, "step": 11810 }, { "epoch": 1.398154719659333, "grad_norm": 2.795447826385498, "learning_rate": 1.6828662039225394e-07, "loss": 0.0767, "step": 11820 }, { "epoch": 1.399337591672581, "grad_norm": 2.6641666889190674, "learning_rate": 1.6816235943058202e-07, "loss": 0.0661, "step": 11830 }, { "epoch": 1.4005204636858293, "grad_norm": 2.4889347553253174, "learning_rate": 1.6803809846891013e-07, "loss": 0.0792, "step": 11840 }, { "epoch": 1.4017033356990773, "grad_norm": 2.8332438468933105, "learning_rate": 1.679138375072382e-07, "loss": 0.0767, "step": 11850 }, { "epoch": 1.4028862077123256, "grad_norm": 2.8409509658813477, "learning_rate": 1.6778957654556632e-07, "loss": 0.0799, "step": 11860 }, { "epoch": 1.4040690797255737, "grad_norm": 3.240828037261963, "learning_rate": 1.6766531558389442e-07, "loss": 0.0721, "step": 11870 }, { "epoch": 1.4052519517388218, "grad_norm": 2.721203088760376, "learning_rate": 1.6754105462222256e-07, "loss": 0.0748, "step": 11880 }, { "epoch": 1.40643482375207, "grad_norm": 2.431087017059326, "learning_rate": 1.6741679366055064e-07, "loss": 0.0813, "step": 11890 }, { "epoch": 1.4076176957653181, "grad_norm": 2.2055866718292236, "learning_rate": 1.6729253269887874e-07, "loss": 0.0796, "step": 11900 }, { "epoch": 1.4088005677785664, "grad_norm": 3.4415206909179688, "learning_rate": 1.6716827173720682e-07, "loss": 0.0786, "step": 11910 }, { "epoch": 1.4099834397918145, "grad_norm": 3.318373680114746, "learning_rate": 1.6704401077553493e-07, "loss": 0.0809, "step": 11920 }, { "epoch": 1.4111663118050628, "grad_norm": 2.8342413902282715, "learning_rate": 1.6691974981386304e-07, "loss": 0.0811, "step": 11930 }, { "epoch": 1.4123491838183109, "grad_norm": 1.6937507390975952, "learning_rate": 1.6679548885219112e-07, "loss": 0.0753, "step": 11940 }, { "epoch": 1.413532055831559, "grad_norm": 2.8954501152038574, "learning_rate": 1.6667122789051922e-07, "loss": 0.0847, "step": 11950 }, { "epoch": 1.4147149278448072, "grad_norm": 1.957468867301941, "learning_rate": 1.6654696692884735e-07, "loss": 0.0702, "step": 11960 }, { "epoch": 1.4158977998580553, "grad_norm": 2.8397927284240723, "learning_rate": 1.6642270596717543e-07, "loss": 0.0765, "step": 11970 }, { "epoch": 1.4170806718713036, "grad_norm": 2.25258731842041, "learning_rate": 1.6629844500550354e-07, "loss": 0.0783, "step": 11980 }, { "epoch": 1.4182635438845517, "grad_norm": 1.8865984678268433, "learning_rate": 1.6617418404383165e-07, "loss": 0.0822, "step": 11990 }, { "epoch": 1.4194464158978, "grad_norm": 2.5468978881835938, "learning_rate": 1.6604992308215973e-07, "loss": 0.0785, "step": 12000 }, { "epoch": 1.420629287911048, "grad_norm": 3.7654964923858643, "learning_rate": 1.6592566212048783e-07, "loss": 0.0825, "step": 12010 }, { "epoch": 1.421812159924296, "grad_norm": 2.8325819969177246, "learning_rate": 1.6580140115881591e-07, "loss": 0.0725, "step": 12020 }, { "epoch": 1.4229950319375444, "grad_norm": 4.062893390655518, "learning_rate": 1.6567714019714402e-07, "loss": 0.0738, "step": 12030 }, { "epoch": 1.4241779039507925, "grad_norm": 2.9774904251098633, "learning_rate": 1.6555287923547215e-07, "loss": 0.0802, "step": 12040 }, { "epoch": 1.4253607759640408, "grad_norm": 1.971998691558838, "learning_rate": 1.6542861827380023e-07, "loss": 0.0743, "step": 12050 }, { "epoch": 1.4265436479772888, "grad_norm": 2.7451961040496826, "learning_rate": 1.6530435731212834e-07, "loss": 0.0747, "step": 12060 }, { "epoch": 1.4277265199905371, "grad_norm": 3.7760231494903564, "learning_rate": 1.6518009635045645e-07, "loss": 0.0743, "step": 12070 }, { "epoch": 1.4289093920037852, "grad_norm": 2.7121410369873047, "learning_rate": 1.6505583538878453e-07, "loss": 0.0771, "step": 12080 }, { "epoch": 1.4300922640170333, "grad_norm": 2.3564562797546387, "learning_rate": 1.6493157442711263e-07, "loss": 0.0841, "step": 12090 }, { "epoch": 1.4312751360302816, "grad_norm": 2.2318201065063477, "learning_rate": 1.6480731346544074e-07, "loss": 0.0732, "step": 12100 }, { "epoch": 1.4324580080435296, "grad_norm": 2.205026149749756, "learning_rate": 1.6468305250376882e-07, "loss": 0.079, "step": 12110 }, { "epoch": 1.433640880056778, "grad_norm": 2.14477801322937, "learning_rate": 1.6455879154209695e-07, "loss": 0.0737, "step": 12120 }, { "epoch": 1.434823752070026, "grad_norm": 2.4429986476898193, "learning_rate": 1.6443453058042506e-07, "loss": 0.0656, "step": 12130 }, { "epoch": 1.4360066240832743, "grad_norm": 2.9509525299072266, "learning_rate": 1.6431026961875314e-07, "loss": 0.0787, "step": 12140 }, { "epoch": 1.4371894960965224, "grad_norm": 2.637894630432129, "learning_rate": 1.6418600865708124e-07, "loss": 0.0832, "step": 12150 }, { "epoch": 1.4383723681097704, "grad_norm": 2.9412858486175537, "learning_rate": 1.6406174769540932e-07, "loss": 0.0788, "step": 12160 }, { "epoch": 1.4395552401230187, "grad_norm": 3.8428702354431152, "learning_rate": 1.6393748673373743e-07, "loss": 0.0771, "step": 12170 }, { "epoch": 1.4407381121362668, "grad_norm": 2.4573512077331543, "learning_rate": 1.6381322577206554e-07, "loss": 0.0744, "step": 12180 }, { "epoch": 1.441920984149515, "grad_norm": 2.374448299407959, "learning_rate": 1.6368896481039362e-07, "loss": 0.0755, "step": 12190 }, { "epoch": 1.4431038561627632, "grad_norm": 2.659374237060547, "learning_rate": 1.6356470384872172e-07, "loss": 0.0797, "step": 12200 }, { "epoch": 1.4442867281760114, "grad_norm": 2.463613986968994, "learning_rate": 1.6344044288704986e-07, "loss": 0.0802, "step": 12210 }, { "epoch": 1.4454696001892595, "grad_norm": 2.3862240314483643, "learning_rate": 1.6331618192537794e-07, "loss": 0.0738, "step": 12220 }, { "epoch": 1.4466524722025076, "grad_norm": 2.6229922771453857, "learning_rate": 1.6319192096370604e-07, "loss": 0.0692, "step": 12230 }, { "epoch": 1.4478353442157559, "grad_norm": 2.3433449268341064, "learning_rate": 1.6306766000203415e-07, "loss": 0.074, "step": 12240 }, { "epoch": 1.449018216229004, "grad_norm": 2.602471113204956, "learning_rate": 1.6294339904036223e-07, "loss": 0.0773, "step": 12250 }, { "epoch": 1.4502010882422522, "grad_norm": 2.4466707706451416, "learning_rate": 1.6281913807869033e-07, "loss": 0.0735, "step": 12260 }, { "epoch": 1.4513839602555003, "grad_norm": 3.1685616970062256, "learning_rate": 1.6269487711701841e-07, "loss": 0.0735, "step": 12270 }, { "epoch": 1.4525668322687486, "grad_norm": 3.060119867324829, "learning_rate": 1.6257061615534652e-07, "loss": 0.0752, "step": 12280 }, { "epoch": 1.4537497042819967, "grad_norm": 1.8927974700927734, "learning_rate": 1.6244635519367465e-07, "loss": 0.0744, "step": 12290 }, { "epoch": 1.4549325762952448, "grad_norm": 2.5929012298583984, "learning_rate": 1.6232209423200276e-07, "loss": 0.0725, "step": 12300 }, { "epoch": 1.456115448308493, "grad_norm": 2.885387420654297, "learning_rate": 1.6219783327033084e-07, "loss": 0.074, "step": 12310 }, { "epoch": 1.4572983203217411, "grad_norm": 3.64172625541687, "learning_rate": 1.6207357230865895e-07, "loss": 0.0643, "step": 12320 }, { "epoch": 1.4584811923349894, "grad_norm": 2.650055170059204, "learning_rate": 1.6194931134698703e-07, "loss": 0.0776, "step": 12330 }, { "epoch": 1.4596640643482375, "grad_norm": 3.483159303665161, "learning_rate": 1.6182505038531513e-07, "loss": 0.0758, "step": 12340 }, { "epoch": 1.4608469363614858, "grad_norm": 2.9976820945739746, "learning_rate": 1.6170078942364324e-07, "loss": 0.0817, "step": 12350 }, { "epoch": 1.4620298083747338, "grad_norm": 2.9169461727142334, "learning_rate": 1.6157652846197132e-07, "loss": 0.0761, "step": 12360 }, { "epoch": 1.463212680387982, "grad_norm": 2.815990447998047, "learning_rate": 1.6145226750029945e-07, "loss": 0.0804, "step": 12370 }, { "epoch": 1.4643955524012302, "grad_norm": 2.641564130783081, "learning_rate": 1.6132800653862756e-07, "loss": 0.0812, "step": 12380 }, { "epoch": 1.4655784244144783, "grad_norm": 1.9225997924804688, "learning_rate": 1.6120374557695564e-07, "loss": 0.0819, "step": 12390 }, { "epoch": 1.4667612964277266, "grad_norm": 1.8078157901763916, "learning_rate": 1.6107948461528374e-07, "loss": 0.0737, "step": 12400 }, { "epoch": 1.4679441684409746, "grad_norm": 2.9196226596832275, "learning_rate": 1.6095522365361185e-07, "loss": 0.0794, "step": 12410 }, { "epoch": 1.469127040454223, "grad_norm": 2.7919187545776367, "learning_rate": 1.6083096269193993e-07, "loss": 0.0831, "step": 12420 }, { "epoch": 1.470309912467471, "grad_norm": 2.536892890930176, "learning_rate": 1.6070670173026804e-07, "loss": 0.0718, "step": 12430 }, { "epoch": 1.471492784480719, "grad_norm": 2.2548000812530518, "learning_rate": 1.6058244076859612e-07, "loss": 0.0863, "step": 12440 }, { "epoch": 1.4726756564939674, "grad_norm": 2.833282470703125, "learning_rate": 1.6045817980692425e-07, "loss": 0.0813, "step": 12450 }, { "epoch": 1.4738585285072154, "grad_norm": 2.506416082382202, "learning_rate": 1.6033391884525236e-07, "loss": 0.0783, "step": 12460 }, { "epoch": 1.4750414005204637, "grad_norm": 2.017519950866699, "learning_rate": 1.6020965788358044e-07, "loss": 0.0683, "step": 12470 }, { "epoch": 1.4762242725337118, "grad_norm": 2.370495557785034, "learning_rate": 1.6008539692190854e-07, "loss": 0.072, "step": 12480 }, { "epoch": 1.47740714454696, "grad_norm": 2.27455472946167, "learning_rate": 1.5996113596023665e-07, "loss": 0.0686, "step": 12490 }, { "epoch": 1.4785900165602082, "grad_norm": 1.782141923904419, "learning_rate": 1.5983687499856473e-07, "loss": 0.0707, "step": 12500 }, { "epoch": 1.4797728885734562, "grad_norm": 2.677432060241699, "learning_rate": 1.5971261403689284e-07, "loss": 0.0827, "step": 12510 }, { "epoch": 1.4809557605867045, "grad_norm": 2.505136489868164, "learning_rate": 1.5958835307522094e-07, "loss": 0.07, "step": 12520 }, { "epoch": 1.4821386325999528, "grad_norm": 1.8471190929412842, "learning_rate": 1.5946409211354902e-07, "loss": 0.0761, "step": 12530 }, { "epoch": 1.483321504613201, "grad_norm": 2.9113714694976807, "learning_rate": 1.5933983115187716e-07, "loss": 0.0749, "step": 12540 }, { "epoch": 1.484504376626449, "grad_norm": 2.8562185764312744, "learning_rate": 1.5921557019020526e-07, "loss": 0.077, "step": 12550 }, { "epoch": 1.4856872486396973, "grad_norm": 2.36118221282959, "learning_rate": 1.5909130922853334e-07, "loss": 0.0661, "step": 12560 }, { "epoch": 1.4868701206529453, "grad_norm": 2.110677480697632, "learning_rate": 1.5896704826686145e-07, "loss": 0.0713, "step": 12570 }, { "epoch": 1.4880529926661934, "grad_norm": 2.109257698059082, "learning_rate": 1.5884278730518953e-07, "loss": 0.0735, "step": 12580 }, { "epoch": 1.4892358646794417, "grad_norm": 2.7162485122680664, "learning_rate": 1.5871852634351763e-07, "loss": 0.0838, "step": 12590 }, { "epoch": 1.49041873669269, "grad_norm": 3.073014736175537, "learning_rate": 1.5859426538184574e-07, "loss": 0.0783, "step": 12600 }, { "epoch": 1.491601608705938, "grad_norm": 3.7398202419281006, "learning_rate": 1.5847000442017382e-07, "loss": 0.072, "step": 12610 }, { "epoch": 1.4927844807191861, "grad_norm": 2.6274266242980957, "learning_rate": 1.5834574345850195e-07, "loss": 0.0689, "step": 12620 }, { "epoch": 1.4939673527324344, "grad_norm": 2.1175758838653564, "learning_rate": 1.5822148249683006e-07, "loss": 0.0761, "step": 12630 }, { "epoch": 1.4951502247456825, "grad_norm": 3.702685832977295, "learning_rate": 1.5809722153515814e-07, "loss": 0.0737, "step": 12640 }, { "epoch": 1.4963330967589306, "grad_norm": 1.8751152753829956, "learning_rate": 1.5797296057348625e-07, "loss": 0.0725, "step": 12650 }, { "epoch": 1.4975159687721789, "grad_norm": 2.1738619804382324, "learning_rate": 1.5784869961181435e-07, "loss": 0.0734, "step": 12660 }, { "epoch": 1.4986988407854271, "grad_norm": 3.5998575687408447, "learning_rate": 1.5772443865014243e-07, "loss": 0.0762, "step": 12670 }, { "epoch": 1.4998817127986752, "grad_norm": 3.0137252807617188, "learning_rate": 1.5760017768847054e-07, "loss": 0.0766, "step": 12680 }, { "epoch": 1.5010645848119233, "grad_norm": 2.308082342147827, "learning_rate": 1.5747591672679862e-07, "loss": 0.0773, "step": 12690 }, { "epoch": 1.5022474568251716, "grad_norm": 2.3885066509246826, "learning_rate": 1.5735165576512675e-07, "loss": 0.081, "step": 12700 }, { "epoch": 1.5034303288384196, "grad_norm": 2.8867650032043457, "learning_rate": 1.5722739480345486e-07, "loss": 0.0765, "step": 12710 }, { "epoch": 1.5046132008516677, "grad_norm": 2.251765012741089, "learning_rate": 1.5710313384178296e-07, "loss": 0.0697, "step": 12720 }, { "epoch": 1.505796072864916, "grad_norm": 3.3186964988708496, "learning_rate": 1.5697887288011104e-07, "loss": 0.0684, "step": 12730 }, { "epoch": 1.5069789448781643, "grad_norm": 2.5522549152374268, "learning_rate": 1.5685461191843915e-07, "loss": 0.0739, "step": 12740 }, { "epoch": 1.5081618168914124, "grad_norm": 2.2957100868225098, "learning_rate": 1.5673035095676723e-07, "loss": 0.0758, "step": 12750 }, { "epoch": 1.5093446889046604, "grad_norm": 2.7490932941436768, "learning_rate": 1.5660608999509534e-07, "loss": 0.0723, "step": 12760 }, { "epoch": 1.5105275609179087, "grad_norm": 3.0851378440856934, "learning_rate": 1.5648182903342344e-07, "loss": 0.0678, "step": 12770 }, { "epoch": 1.5117104329311568, "grad_norm": 2.8045332431793213, "learning_rate": 1.5635756807175155e-07, "loss": 0.0744, "step": 12780 }, { "epoch": 1.5128933049444049, "grad_norm": 2.630098819732666, "learning_rate": 1.5623330711007966e-07, "loss": 0.0804, "step": 12790 }, { "epoch": 1.5140761769576532, "grad_norm": 1.9100730419158936, "learning_rate": 1.5610904614840776e-07, "loss": 0.0685, "step": 12800 }, { "epoch": 1.5152590489709015, "grad_norm": 2.9593372344970703, "learning_rate": 1.5598478518673584e-07, "loss": 0.0768, "step": 12810 }, { "epoch": 1.5164419209841495, "grad_norm": 3.699347496032715, "learning_rate": 1.5586052422506395e-07, "loss": 0.0725, "step": 12820 }, { "epoch": 1.5176247929973976, "grad_norm": 2.292294979095459, "learning_rate": 1.5573626326339206e-07, "loss": 0.0681, "step": 12830 }, { "epoch": 1.518807665010646, "grad_norm": 2.903193712234497, "learning_rate": 1.5561200230172014e-07, "loss": 0.0789, "step": 12840 }, { "epoch": 1.519990537023894, "grad_norm": 2.1169464588165283, "learning_rate": 1.5548774134004824e-07, "loss": 0.0772, "step": 12850 }, { "epoch": 1.521173409037142, "grad_norm": 2.6150786876678467, "learning_rate": 1.5536348037837637e-07, "loss": 0.075, "step": 12860 }, { "epoch": 1.5223562810503903, "grad_norm": 2.9733974933624268, "learning_rate": 1.5523921941670445e-07, "loss": 0.0726, "step": 12870 }, { "epoch": 1.5235391530636386, "grad_norm": 2.660935163497925, "learning_rate": 1.5511495845503256e-07, "loss": 0.0657, "step": 12880 }, { "epoch": 1.5247220250768867, "grad_norm": 2.2162015438079834, "learning_rate": 1.5499069749336064e-07, "loss": 0.0714, "step": 12890 }, { "epoch": 1.5259048970901348, "grad_norm": 2.9221768379211426, "learning_rate": 1.5486643653168875e-07, "loss": 0.0788, "step": 12900 }, { "epoch": 1.527087769103383, "grad_norm": 4.137559413909912, "learning_rate": 1.5474217557001685e-07, "loss": 0.0773, "step": 12910 }, { "epoch": 1.5282706411166311, "grad_norm": 1.9518332481384277, "learning_rate": 1.5461791460834493e-07, "loss": 0.0844, "step": 12920 }, { "epoch": 1.5294535131298792, "grad_norm": 2.7211835384368896, "learning_rate": 1.5449365364667304e-07, "loss": 0.067, "step": 12930 }, { "epoch": 1.5306363851431275, "grad_norm": 2.351548910140991, "learning_rate": 1.5436939268500115e-07, "loss": 0.0851, "step": 12940 }, { "epoch": 1.5318192571563758, "grad_norm": 2.4513092041015625, "learning_rate": 1.5424513172332925e-07, "loss": 0.0836, "step": 12950 }, { "epoch": 1.5330021291696239, "grad_norm": 2.700648307800293, "learning_rate": 1.5412087076165736e-07, "loss": 0.0697, "step": 12960 }, { "epoch": 1.534185001182872, "grad_norm": 1.8743282556533813, "learning_rate": 1.5399660979998547e-07, "loss": 0.0758, "step": 12970 }, { "epoch": 1.5353678731961202, "grad_norm": 2.5247812271118164, "learning_rate": 1.5387234883831355e-07, "loss": 0.0682, "step": 12980 }, { "epoch": 1.5365507452093683, "grad_norm": 1.8144539594650269, "learning_rate": 1.5374808787664165e-07, "loss": 0.0788, "step": 12990 }, { "epoch": 1.5377336172226164, "grad_norm": 2.4344139099121094, "learning_rate": 1.5362382691496973e-07, "loss": 0.0779, "step": 13000 }, { "epoch": 1.5389164892358647, "grad_norm": 2.52327561378479, "learning_rate": 1.5349956595329784e-07, "loss": 0.073, "step": 13010 }, { "epoch": 1.540099361249113, "grad_norm": 2.1165578365325928, "learning_rate": 1.5337530499162594e-07, "loss": 0.0744, "step": 13020 }, { "epoch": 1.541282233262361, "grad_norm": 2.8333394527435303, "learning_rate": 1.5325104402995408e-07, "loss": 0.0696, "step": 13030 }, { "epoch": 1.542465105275609, "grad_norm": 2.716153383255005, "learning_rate": 1.5312678306828216e-07, "loss": 0.081, "step": 13040 }, { "epoch": 1.5436479772888574, "grad_norm": 2.695774555206299, "learning_rate": 1.5300252210661026e-07, "loss": 0.0778, "step": 13050 }, { "epoch": 1.5448308493021055, "grad_norm": 3.6361284255981445, "learning_rate": 1.5287826114493834e-07, "loss": 0.0747, "step": 13060 }, { "epoch": 1.5460137213153535, "grad_norm": 3.028341293334961, "learning_rate": 1.5275400018326645e-07, "loss": 0.0707, "step": 13070 }, { "epoch": 1.5471965933286018, "grad_norm": 3.3370003700256348, "learning_rate": 1.5262973922159456e-07, "loss": 0.0783, "step": 13080 }, { "epoch": 1.5483794653418501, "grad_norm": 1.9062432050704956, "learning_rate": 1.5250547825992264e-07, "loss": 0.0741, "step": 13090 }, { "epoch": 1.5495623373550982, "grad_norm": 3.4937567710876465, "learning_rate": 1.5238121729825074e-07, "loss": 0.0826, "step": 13100 }, { "epoch": 1.5507452093683463, "grad_norm": 2.9436402320861816, "learning_rate": 1.5225695633657888e-07, "loss": 0.0802, "step": 13110 }, { "epoch": 1.5519280813815945, "grad_norm": 2.650385856628418, "learning_rate": 1.5213269537490696e-07, "loss": 0.0676, "step": 13120 }, { "epoch": 1.5531109533948428, "grad_norm": 2.9563920497894287, "learning_rate": 1.5200843441323506e-07, "loss": 0.0682, "step": 13130 }, { "epoch": 1.5542938254080907, "grad_norm": 1.7382055521011353, "learning_rate": 1.5188417345156317e-07, "loss": 0.0732, "step": 13140 }, { "epoch": 1.555476697421339, "grad_norm": 1.9839683771133423, "learning_rate": 1.5175991248989125e-07, "loss": 0.0705, "step": 13150 }, { "epoch": 1.5566595694345873, "grad_norm": 3.0164709091186523, "learning_rate": 1.5163565152821935e-07, "loss": 0.0704, "step": 13160 }, { "epoch": 1.5578424414478353, "grad_norm": 3.9835290908813477, "learning_rate": 1.5151139056654743e-07, "loss": 0.0745, "step": 13170 }, { "epoch": 1.5590253134610834, "grad_norm": 3.13079571723938, "learning_rate": 1.5138712960487554e-07, "loss": 0.0806, "step": 13180 }, { "epoch": 1.5602081854743317, "grad_norm": 1.9813945293426514, "learning_rate": 1.5126286864320367e-07, "loss": 0.0731, "step": 13190 }, { "epoch": 1.56139105748758, "grad_norm": 2.7245874404907227, "learning_rate": 1.5113860768153175e-07, "loss": 0.0782, "step": 13200 }, { "epoch": 1.5625739295008279, "grad_norm": 1.8934440612792969, "learning_rate": 1.5101434671985986e-07, "loss": 0.0779, "step": 13210 }, { "epoch": 1.5637568015140761, "grad_norm": 3.0599803924560547, "learning_rate": 1.5089008575818797e-07, "loss": 0.0735, "step": 13220 }, { "epoch": 1.5649396735273244, "grad_norm": 2.0927274227142334, "learning_rate": 1.5076582479651605e-07, "loss": 0.0646, "step": 13230 }, { "epoch": 1.5661225455405725, "grad_norm": 2.549832582473755, "learning_rate": 1.5064156383484415e-07, "loss": 0.0755, "step": 13240 }, { "epoch": 1.5673054175538206, "grad_norm": 3.8321454524993896, "learning_rate": 1.5051730287317226e-07, "loss": 0.0775, "step": 13250 }, { "epoch": 1.5684882895670689, "grad_norm": 2.7840991020202637, "learning_rate": 1.5039304191150034e-07, "loss": 0.0688, "step": 13260 }, { "epoch": 1.5696711615803172, "grad_norm": 3.3256499767303467, "learning_rate": 1.5026878094982847e-07, "loss": 0.08, "step": 13270 }, { "epoch": 1.570854033593565, "grad_norm": 2.833026885986328, "learning_rate": 1.5014451998815658e-07, "loss": 0.0778, "step": 13280 }, { "epoch": 1.5720369056068133, "grad_norm": 2.5727415084838867, "learning_rate": 1.5002025902648466e-07, "loss": 0.0781, "step": 13290 }, { "epoch": 1.5732197776200616, "grad_norm": 2.5702710151672363, "learning_rate": 1.4989599806481276e-07, "loss": 0.0791, "step": 13300 }, { "epoch": 1.5744026496333097, "grad_norm": 2.1258091926574707, "learning_rate": 1.4977173710314084e-07, "loss": 0.0772, "step": 13310 }, { "epoch": 1.5755855216465577, "grad_norm": 2.2789103984832764, "learning_rate": 1.4964747614146895e-07, "loss": 0.0788, "step": 13320 }, { "epoch": 1.576768393659806, "grad_norm": 3.7964651584625244, "learning_rate": 1.4952321517979706e-07, "loss": 0.0752, "step": 13330 }, { "epoch": 1.5779512656730543, "grad_norm": 3.5235137939453125, "learning_rate": 1.4939895421812514e-07, "loss": 0.0734, "step": 13340 }, { "epoch": 1.5791341376863024, "grad_norm": 2.4348983764648438, "learning_rate": 1.4927469325645324e-07, "loss": 0.075, "step": 13350 }, { "epoch": 1.5803170096995505, "grad_norm": 2.912658214569092, "learning_rate": 1.4915043229478138e-07, "loss": 0.0816, "step": 13360 }, { "epoch": 1.5814998817127988, "grad_norm": 2.7602193355560303, "learning_rate": 1.4902617133310946e-07, "loss": 0.0677, "step": 13370 }, { "epoch": 1.5826827537260468, "grad_norm": 2.388660430908203, "learning_rate": 1.4890191037143756e-07, "loss": 0.0706, "step": 13380 }, { "epoch": 1.583865625739295, "grad_norm": 2.3998489379882812, "learning_rate": 1.4877764940976567e-07, "loss": 0.0788, "step": 13390 }, { "epoch": 1.5850484977525432, "grad_norm": 2.1093804836273193, "learning_rate": 1.4865338844809375e-07, "loss": 0.079, "step": 13400 }, { "epoch": 1.5862313697657915, "grad_norm": 2.5355138778686523, "learning_rate": 1.4852912748642186e-07, "loss": 0.0788, "step": 13410 }, { "epoch": 1.5874142417790396, "grad_norm": 3.400830030441284, "learning_rate": 1.4840486652474994e-07, "loss": 0.0757, "step": 13420 }, { "epoch": 1.5885971137922876, "grad_norm": 2.437443494796753, "learning_rate": 1.4828060556307804e-07, "loss": 0.0792, "step": 13430 }, { "epoch": 1.589779985805536, "grad_norm": 2.194887638092041, "learning_rate": 1.4815634460140617e-07, "loss": 0.0765, "step": 13440 }, { "epoch": 1.590962857818784, "grad_norm": 1.820046305656433, "learning_rate": 1.4803208363973428e-07, "loss": 0.0754, "step": 13450 }, { "epoch": 1.592145729832032, "grad_norm": 2.1535489559173584, "learning_rate": 1.4790782267806236e-07, "loss": 0.0705, "step": 13460 }, { "epoch": 1.5933286018452804, "grad_norm": 3.434300661087036, "learning_rate": 1.4778356171639047e-07, "loss": 0.0716, "step": 13470 }, { "epoch": 1.5945114738585286, "grad_norm": 2.8789334297180176, "learning_rate": 1.4765930075471855e-07, "loss": 0.0777, "step": 13480 }, { "epoch": 1.5956943458717767, "grad_norm": 2.0619235038757324, "learning_rate": 1.4753503979304665e-07, "loss": 0.0747, "step": 13490 }, { "epoch": 1.5968772178850248, "grad_norm": 2.6454806327819824, "learning_rate": 1.4741077883137476e-07, "loss": 0.0727, "step": 13500 }, { "epoch": 1.598060089898273, "grad_norm": 3.0681769847869873, "learning_rate": 1.4728651786970284e-07, "loss": 0.0773, "step": 13510 }, { "epoch": 1.5992429619115212, "grad_norm": 2.2493906021118164, "learning_rate": 1.4716225690803097e-07, "loss": 0.0716, "step": 13520 }, { "epoch": 1.6004258339247692, "grad_norm": 2.142681121826172, "learning_rate": 1.4703799594635908e-07, "loss": 0.0714, "step": 13530 }, { "epoch": 1.6016087059380175, "grad_norm": 2.37764573097229, "learning_rate": 1.4691373498468716e-07, "loss": 0.073, "step": 13540 }, { "epoch": 1.6027915779512658, "grad_norm": 2.3749160766601562, "learning_rate": 1.4678947402301527e-07, "loss": 0.0726, "step": 13550 }, { "epoch": 1.6039744499645139, "grad_norm": 2.5025763511657715, "learning_rate": 1.4666521306134337e-07, "loss": 0.0712, "step": 13560 }, { "epoch": 1.605157321977762, "grad_norm": 2.661851167678833, "learning_rate": 1.4654095209967145e-07, "loss": 0.0718, "step": 13570 }, { "epoch": 1.6063401939910102, "grad_norm": 2.2790133953094482, "learning_rate": 1.4641669113799956e-07, "loss": 0.0794, "step": 13580 }, { "epoch": 1.6075230660042583, "grad_norm": 1.922194480895996, "learning_rate": 1.4629243017632764e-07, "loss": 0.0765, "step": 13590 }, { "epoch": 1.6087059380175064, "grad_norm": 3.491849899291992, "learning_rate": 1.4616816921465577e-07, "loss": 0.0756, "step": 13600 }, { "epoch": 1.6098888100307547, "grad_norm": 1.9582055807113647, "learning_rate": 1.4604390825298388e-07, "loss": 0.0734, "step": 13610 }, { "epoch": 1.611071682044003, "grad_norm": 2.725914716720581, "learning_rate": 1.4591964729131196e-07, "loss": 0.0727, "step": 13620 }, { "epoch": 1.612254554057251, "grad_norm": 3.090667486190796, "learning_rate": 1.4579538632964006e-07, "loss": 0.0735, "step": 13630 }, { "epoch": 1.6134374260704991, "grad_norm": 2.926482677459717, "learning_rate": 1.4567112536796817e-07, "loss": 0.0707, "step": 13640 }, { "epoch": 1.6146202980837474, "grad_norm": 2.6732382774353027, "learning_rate": 1.4554686440629625e-07, "loss": 0.072, "step": 13650 }, { "epoch": 1.6158031700969955, "grad_norm": 2.2736141681671143, "learning_rate": 1.4542260344462436e-07, "loss": 0.0794, "step": 13660 }, { "epoch": 1.6169860421102435, "grad_norm": 3.4136788845062256, "learning_rate": 1.4529834248295246e-07, "loss": 0.0728, "step": 13670 }, { "epoch": 1.6181689141234918, "grad_norm": 2.256060838699341, "learning_rate": 1.4517408152128057e-07, "loss": 0.0715, "step": 13680 }, { "epoch": 1.6193517861367401, "grad_norm": 2.582105875015259, "learning_rate": 1.4504982055960868e-07, "loss": 0.0862, "step": 13690 }, { "epoch": 1.6205346581499882, "grad_norm": 3.2643513679504395, "learning_rate": 1.4492555959793678e-07, "loss": 0.081, "step": 13700 }, { "epoch": 1.6217175301632363, "grad_norm": 2.549550771713257, "learning_rate": 1.4480129863626486e-07, "loss": 0.0777, "step": 13710 }, { "epoch": 1.6229004021764846, "grad_norm": 2.973472833633423, "learning_rate": 1.4467703767459297e-07, "loss": 0.0737, "step": 13720 }, { "epoch": 1.6240832741897326, "grad_norm": 3.0125675201416016, "learning_rate": 1.4455277671292105e-07, "loss": 0.075, "step": 13730 }, { "epoch": 1.6252661462029807, "grad_norm": 2.1024320125579834, "learning_rate": 1.4442851575124915e-07, "loss": 0.0749, "step": 13740 }, { "epoch": 1.626449018216229, "grad_norm": 2.453429937362671, "learning_rate": 1.4430425478957726e-07, "loss": 0.0724, "step": 13750 }, { "epoch": 1.6276318902294773, "grad_norm": 2.720118999481201, "learning_rate": 1.4417999382790534e-07, "loss": 0.0779, "step": 13760 }, { "epoch": 1.6288147622427254, "grad_norm": 2.4228780269622803, "learning_rate": 1.4405573286623347e-07, "loss": 0.0798, "step": 13770 }, { "epoch": 1.6299976342559734, "grad_norm": 2.6631994247436523, "learning_rate": 1.4393147190456158e-07, "loss": 0.0658, "step": 13780 }, { "epoch": 1.6311805062692217, "grad_norm": 2.2503039836883545, "learning_rate": 1.4380721094288966e-07, "loss": 0.0722, "step": 13790 }, { "epoch": 1.6323633782824698, "grad_norm": 2.4487740993499756, "learning_rate": 1.4368294998121777e-07, "loss": 0.081, "step": 13800 }, { "epoch": 1.6335462502957179, "grad_norm": 2.866624593734741, "learning_rate": 1.4355868901954587e-07, "loss": 0.0697, "step": 13810 }, { "epoch": 1.6347291223089662, "grad_norm": 2.587797164916992, "learning_rate": 1.4343442805787395e-07, "loss": 0.0797, "step": 13820 }, { "epoch": 1.6359119943222145, "grad_norm": 3.7513763904571533, "learning_rate": 1.4331016709620206e-07, "loss": 0.0713, "step": 13830 }, { "epoch": 1.6370948663354625, "grad_norm": 3.426050901412964, "learning_rate": 1.4318590613453014e-07, "loss": 0.0811, "step": 13840 }, { "epoch": 1.6382777383487106, "grad_norm": 3.104196548461914, "learning_rate": 1.4306164517285827e-07, "loss": 0.0774, "step": 13850 }, { "epoch": 1.639460610361959, "grad_norm": 2.3884215354919434, "learning_rate": 1.4293738421118638e-07, "loss": 0.0734, "step": 13860 }, { "epoch": 1.640643482375207, "grad_norm": 2.522376537322998, "learning_rate": 1.4281312324951448e-07, "loss": 0.0753, "step": 13870 }, { "epoch": 1.641826354388455, "grad_norm": 2.7184536457061768, "learning_rate": 1.4268886228784256e-07, "loss": 0.0745, "step": 13880 }, { "epoch": 1.6430092264017033, "grad_norm": 2.0393552780151367, "learning_rate": 1.4256460132617067e-07, "loss": 0.0839, "step": 13890 }, { "epoch": 1.6441920984149516, "grad_norm": 3.770637273788452, "learning_rate": 1.4244034036449875e-07, "loss": 0.0842, "step": 13900 }, { "epoch": 1.6453749704281997, "grad_norm": 3.5441575050354004, "learning_rate": 1.4231607940282686e-07, "loss": 0.0733, "step": 13910 }, { "epoch": 1.6465578424414478, "grad_norm": 2.290829658508301, "learning_rate": 1.4219181844115496e-07, "loss": 0.0787, "step": 13920 }, { "epoch": 1.647740714454696, "grad_norm": 2.2517740726470947, "learning_rate": 1.4206755747948307e-07, "loss": 0.0664, "step": 13930 }, { "epoch": 1.6489235864679441, "grad_norm": 2.0012881755828857, "learning_rate": 1.4194329651781118e-07, "loss": 0.0723, "step": 13940 }, { "epoch": 1.6501064584811922, "grad_norm": 2.2125463485717773, "learning_rate": 1.4181903555613928e-07, "loss": 0.0775, "step": 13950 }, { "epoch": 1.6512893304944405, "grad_norm": 2.385274648666382, "learning_rate": 1.4169477459446736e-07, "loss": 0.0776, "step": 13960 }, { "epoch": 1.6524722025076888, "grad_norm": 2.468327045440674, "learning_rate": 1.4157051363279547e-07, "loss": 0.0742, "step": 13970 }, { "epoch": 1.6536550745209369, "grad_norm": 2.8094797134399414, "learning_rate": 1.4144625267112358e-07, "loss": 0.0727, "step": 13980 }, { "epoch": 1.654837946534185, "grad_norm": 2.4896485805511475, "learning_rate": 1.4132199170945168e-07, "loss": 0.0729, "step": 13990 }, { "epoch": 1.6560208185474332, "grad_norm": 1.8312571048736572, "learning_rate": 1.4119773074777976e-07, "loss": 0.0698, "step": 14000 }, { "epoch": 1.6572036905606813, "grad_norm": 2.390502452850342, "learning_rate": 1.4107346978610787e-07, "loss": 0.0726, "step": 14010 }, { "epoch": 1.6583865625739294, "grad_norm": 3.522059202194214, "learning_rate": 1.4094920882443597e-07, "loss": 0.0825, "step": 14020 }, { "epoch": 1.6595694345871776, "grad_norm": 3.6432313919067383, "learning_rate": 1.4082494786276408e-07, "loss": 0.0766, "step": 14030 }, { "epoch": 1.660752306600426, "grad_norm": 2.8957674503326416, "learning_rate": 1.4070068690109216e-07, "loss": 0.0721, "step": 14040 }, { "epoch": 1.661935178613674, "grad_norm": 2.6962873935699463, "learning_rate": 1.4057642593942027e-07, "loss": 0.0744, "step": 14050 }, { "epoch": 1.663118050626922, "grad_norm": 2.658935546875, "learning_rate": 1.4045216497774837e-07, "loss": 0.0743, "step": 14060 }, { "epoch": 1.6643009226401704, "grad_norm": 2.4486286640167236, "learning_rate": 1.4032790401607648e-07, "loss": 0.0716, "step": 14070 }, { "epoch": 1.6654837946534184, "grad_norm": 2.5091662406921387, "learning_rate": 1.4020364305440459e-07, "loss": 0.0761, "step": 14080 }, { "epoch": 1.6666666666666665, "grad_norm": 2.8873329162597656, "learning_rate": 1.4007938209273267e-07, "loss": 0.0685, "step": 14090 }, { "epoch": 1.6678495386799148, "grad_norm": 2.2191410064697266, "learning_rate": 1.3995512113106077e-07, "loss": 0.0747, "step": 14100 }, { "epoch": 1.669032410693163, "grad_norm": 2.562868595123291, "learning_rate": 1.3983086016938885e-07, "loss": 0.0781, "step": 14110 }, { "epoch": 1.6702152827064112, "grad_norm": 2.340390920639038, "learning_rate": 1.3970659920771699e-07, "loss": 0.0716, "step": 14120 }, { "epoch": 1.6713981547196592, "grad_norm": 3.645477294921875, "learning_rate": 1.3958233824604507e-07, "loss": 0.0809, "step": 14130 }, { "epoch": 1.6725810267329075, "grad_norm": 3.2264325618743896, "learning_rate": 1.3945807728437317e-07, "loss": 0.0743, "step": 14140 }, { "epoch": 1.6737638987461558, "grad_norm": 2.583566427230835, "learning_rate": 1.3933381632270125e-07, "loss": 0.0715, "step": 14150 }, { "epoch": 1.6749467707594037, "grad_norm": 3.8324971199035645, "learning_rate": 1.3920955536102938e-07, "loss": 0.071, "step": 14160 }, { "epoch": 1.676129642772652, "grad_norm": 2.207859754562378, "learning_rate": 1.3908529439935746e-07, "loss": 0.0778, "step": 14170 }, { "epoch": 1.6773125147859003, "grad_norm": 2.6080267429351807, "learning_rate": 1.3896103343768557e-07, "loss": 0.071, "step": 14180 }, { "epoch": 1.6784953867991483, "grad_norm": 2.897057294845581, "learning_rate": 1.3883677247601368e-07, "loss": 0.0786, "step": 14190 }, { "epoch": 1.6796782588123964, "grad_norm": 2.122075080871582, "learning_rate": 1.3871251151434178e-07, "loss": 0.074, "step": 14200 }, { "epoch": 1.6808611308256447, "grad_norm": 2.541294813156128, "learning_rate": 1.3858825055266986e-07, "loss": 0.0786, "step": 14210 }, { "epoch": 1.682044002838893, "grad_norm": 2.5583627223968506, "learning_rate": 1.3846398959099797e-07, "loss": 0.0793, "step": 14220 }, { "epoch": 1.6832268748521408, "grad_norm": 2.9530742168426514, "learning_rate": 1.3833972862932608e-07, "loss": 0.0718, "step": 14230 }, { "epoch": 1.6844097468653891, "grad_norm": 2.176906108856201, "learning_rate": 1.3821546766765418e-07, "loss": 0.0708, "step": 14240 }, { "epoch": 1.6855926188786374, "grad_norm": 2.3678524494171143, "learning_rate": 1.3809120670598226e-07, "loss": 0.0771, "step": 14250 }, { "epoch": 1.6867754908918855, "grad_norm": 2.139183521270752, "learning_rate": 1.3796694574431037e-07, "loss": 0.074, "step": 14260 }, { "epoch": 1.6879583629051336, "grad_norm": 2.8097918033599854, "learning_rate": 1.3784268478263848e-07, "loss": 0.0835, "step": 14270 }, { "epoch": 1.6891412349183819, "grad_norm": 2.301964521408081, "learning_rate": 1.3771842382096658e-07, "loss": 0.086, "step": 14280 }, { "epoch": 1.6903241069316302, "grad_norm": 2.338724136352539, "learning_rate": 1.375941628592947e-07, "loss": 0.0743, "step": 14290 }, { "epoch": 1.691506978944878, "grad_norm": 2.030856132507324, "learning_rate": 1.3746990189762277e-07, "loss": 0.0776, "step": 14300 }, { "epoch": 1.6926898509581263, "grad_norm": 2.916191816329956, "learning_rate": 1.3734564093595087e-07, "loss": 0.0747, "step": 14310 }, { "epoch": 1.6938727229713746, "grad_norm": 2.4341108798980713, "learning_rate": 1.3722137997427898e-07, "loss": 0.0763, "step": 14320 }, { "epoch": 1.6950555949846227, "grad_norm": 2.2478435039520264, "learning_rate": 1.370971190126071e-07, "loss": 0.0703, "step": 14330 }, { "epoch": 1.6962384669978707, "grad_norm": 2.028482675552368, "learning_rate": 1.3697285805093517e-07, "loss": 0.0798, "step": 14340 }, { "epoch": 1.697421339011119, "grad_norm": 1.8354445695877075, "learning_rate": 1.3684859708926327e-07, "loss": 0.0758, "step": 14350 }, { "epoch": 1.6986042110243673, "grad_norm": 2.6463825702667236, "learning_rate": 1.3672433612759138e-07, "loss": 0.0713, "step": 14360 }, { "epoch": 1.6997870830376152, "grad_norm": 2.2176547050476074, "learning_rate": 1.366000751659195e-07, "loss": 0.0707, "step": 14370 }, { "epoch": 1.7009699550508635, "grad_norm": 3.0243334770202637, "learning_rate": 1.3647581420424757e-07, "loss": 0.0739, "step": 14380 }, { "epoch": 1.7021528270641118, "grad_norm": 2.705674171447754, "learning_rate": 1.3635155324257567e-07, "loss": 0.0815, "step": 14390 }, { "epoch": 1.7033356990773598, "grad_norm": 2.541836738586426, "learning_rate": 1.3622729228090378e-07, "loss": 0.0765, "step": 14400 }, { "epoch": 1.704518571090608, "grad_norm": 2.603595018386841, "learning_rate": 1.3610303131923189e-07, "loss": 0.0708, "step": 14410 }, { "epoch": 1.7057014431038562, "grad_norm": 2.552635669708252, "learning_rate": 1.3597877035755997e-07, "loss": 0.0694, "step": 14420 }, { "epoch": 1.7068843151171045, "grad_norm": 3.0097949504852295, "learning_rate": 1.3585450939588807e-07, "loss": 0.0661, "step": 14430 }, { "epoch": 1.7080671871303525, "grad_norm": 1.9895873069763184, "learning_rate": 1.3573024843421618e-07, "loss": 0.0772, "step": 14440 }, { "epoch": 1.7092500591436006, "grad_norm": 2.684418201446533, "learning_rate": 1.3560598747254429e-07, "loss": 0.0805, "step": 14450 }, { "epoch": 1.710432931156849, "grad_norm": 3.0748753547668457, "learning_rate": 1.3548172651087236e-07, "loss": 0.074, "step": 14460 }, { "epoch": 1.711615803170097, "grad_norm": 2.7645063400268555, "learning_rate": 1.3535746554920047e-07, "loss": 0.071, "step": 14470 }, { "epoch": 1.712798675183345, "grad_norm": 2.584688901901245, "learning_rate": 1.3523320458752858e-07, "loss": 0.0738, "step": 14480 }, { "epoch": 1.7139815471965933, "grad_norm": 1.955483078956604, "learning_rate": 1.3510894362585668e-07, "loss": 0.074, "step": 14490 }, { "epoch": 1.7151644192098416, "grad_norm": 2.076632499694824, "learning_rate": 1.349846826641848e-07, "loss": 0.0648, "step": 14500 }, { "epoch": 1.7163472912230897, "grad_norm": 3.1437559127807617, "learning_rate": 1.3486042170251287e-07, "loss": 0.0759, "step": 14510 }, { "epoch": 1.7175301632363378, "grad_norm": 2.5450212955474854, "learning_rate": 1.3473616074084098e-07, "loss": 0.0787, "step": 14520 }, { "epoch": 1.718713035249586, "grad_norm": 2.6639461517333984, "learning_rate": 1.3461189977916908e-07, "loss": 0.0684, "step": 14530 }, { "epoch": 1.7198959072628341, "grad_norm": 2.3314599990844727, "learning_rate": 1.344876388174972e-07, "loss": 0.0771, "step": 14540 }, { "epoch": 1.7210787792760822, "grad_norm": 2.5527286529541016, "learning_rate": 1.3436337785582527e-07, "loss": 0.0762, "step": 14550 }, { "epoch": 1.7222616512893305, "grad_norm": 2.5854313373565674, "learning_rate": 1.3423911689415338e-07, "loss": 0.0804, "step": 14560 }, { "epoch": 1.7234445233025788, "grad_norm": 2.3305914402008057, "learning_rate": 1.3411485593248148e-07, "loss": 0.0723, "step": 14570 }, { "epoch": 1.7246273953158269, "grad_norm": 3.2173142433166504, "learning_rate": 1.339905949708096e-07, "loss": 0.0801, "step": 14580 }, { "epoch": 1.725810267329075, "grad_norm": 1.9679092168807983, "learning_rate": 1.3386633400913767e-07, "loss": 0.069, "step": 14590 }, { "epoch": 1.7269931393423232, "grad_norm": 2.32478928565979, "learning_rate": 1.3374207304746578e-07, "loss": 0.0826, "step": 14600 }, { "epoch": 1.7281760113555713, "grad_norm": 1.8886853456497192, "learning_rate": 1.3361781208579388e-07, "loss": 0.074, "step": 14610 }, { "epoch": 1.7293588833688194, "grad_norm": 2.6425673961639404, "learning_rate": 1.33493551124122e-07, "loss": 0.0766, "step": 14620 }, { "epoch": 1.7305417553820677, "grad_norm": 2.863003969192505, "learning_rate": 1.3336929016245007e-07, "loss": 0.0737, "step": 14630 }, { "epoch": 1.731724627395316, "grad_norm": 2.0983359813690186, "learning_rate": 1.3324502920077817e-07, "loss": 0.0728, "step": 14640 }, { "epoch": 1.732907499408564, "grad_norm": 2.86582350730896, "learning_rate": 1.3312076823910628e-07, "loss": 0.0773, "step": 14650 }, { "epoch": 1.734090371421812, "grad_norm": 3.84641432762146, "learning_rate": 1.329965072774344e-07, "loss": 0.0708, "step": 14660 }, { "epoch": 1.7352732434350604, "grad_norm": 2.373185157775879, "learning_rate": 1.3287224631576247e-07, "loss": 0.0794, "step": 14670 }, { "epoch": 1.7364561154483085, "grad_norm": 2.7937777042388916, "learning_rate": 1.3274798535409057e-07, "loss": 0.0836, "step": 14680 }, { "epoch": 1.7376389874615565, "grad_norm": 2.3328065872192383, "learning_rate": 1.3262372439241868e-07, "loss": 0.0772, "step": 14690 }, { "epoch": 1.7388218594748048, "grad_norm": 2.966446876525879, "learning_rate": 1.3249946343074679e-07, "loss": 0.0793, "step": 14700 }, { "epoch": 1.7400047314880531, "grad_norm": 1.6382108926773071, "learning_rate": 1.323752024690749e-07, "loss": 0.0721, "step": 14710 }, { "epoch": 1.7411876035013012, "grad_norm": 2.753506660461426, "learning_rate": 1.3225094150740297e-07, "loss": 0.0684, "step": 14720 }, { "epoch": 1.7423704755145493, "grad_norm": 2.711510419845581, "learning_rate": 1.3212668054573108e-07, "loss": 0.0712, "step": 14730 }, { "epoch": 1.7435533475277976, "grad_norm": 2.3088347911834717, "learning_rate": 1.3200241958405919e-07, "loss": 0.0774, "step": 14740 }, { "epoch": 1.7447362195410456, "grad_norm": 2.8262927532196045, "learning_rate": 1.318781586223873e-07, "loss": 0.0684, "step": 14750 }, { "epoch": 1.7459190915542937, "grad_norm": 2.0631659030914307, "learning_rate": 1.3175389766071537e-07, "loss": 0.0707, "step": 14760 }, { "epoch": 1.747101963567542, "grad_norm": 2.741055488586426, "learning_rate": 1.3162963669904348e-07, "loss": 0.0702, "step": 14770 }, { "epoch": 1.7482848355807903, "grad_norm": 2.5778651237487793, "learning_rate": 1.3150537573737158e-07, "loss": 0.0753, "step": 14780 }, { "epoch": 1.7494677075940384, "grad_norm": 2.4155473709106445, "learning_rate": 1.313811147756997e-07, "loss": 0.0714, "step": 14790 }, { "epoch": 1.7506505796072864, "grad_norm": 2.758178472518921, "learning_rate": 1.3125685381402777e-07, "loss": 0.0793, "step": 14800 }, { "epoch": 1.7518334516205347, "grad_norm": 2.182459831237793, "learning_rate": 1.3113259285235588e-07, "loss": 0.0656, "step": 14810 }, { "epoch": 1.7530163236337828, "grad_norm": 3.2857511043548584, "learning_rate": 1.3100833189068398e-07, "loss": 0.0797, "step": 14820 }, { "epoch": 1.7541991956470309, "grad_norm": 2.2547106742858887, "learning_rate": 1.308840709290121e-07, "loss": 0.0714, "step": 14830 }, { "epoch": 1.7553820676602792, "grad_norm": 2.299694061279297, "learning_rate": 1.3075980996734017e-07, "loss": 0.0679, "step": 14840 }, { "epoch": 1.7565649396735274, "grad_norm": 2.1992175579071045, "learning_rate": 1.306355490056683e-07, "loss": 0.0771, "step": 14850 }, { "epoch": 1.7577478116867755, "grad_norm": 2.5159342288970947, "learning_rate": 1.3051128804399638e-07, "loss": 0.0854, "step": 14860 }, { "epoch": 1.7589306837000236, "grad_norm": 2.663545846939087, "learning_rate": 1.303870270823245e-07, "loss": 0.0691, "step": 14870 }, { "epoch": 1.7601135557132719, "grad_norm": 2.5593173503875732, "learning_rate": 1.3026276612065257e-07, "loss": 0.072, "step": 14880 }, { "epoch": 1.76129642772652, "grad_norm": 2.6101737022399902, "learning_rate": 1.3013850515898068e-07, "loss": 0.0791, "step": 14890 }, { "epoch": 1.762479299739768, "grad_norm": 2.2886903285980225, "learning_rate": 1.3001424419730878e-07, "loss": 0.0736, "step": 14900 }, { "epoch": 1.7636621717530163, "grad_norm": 3.213513135910034, "learning_rate": 1.298899832356369e-07, "loss": 0.0727, "step": 14910 }, { "epoch": 1.7648450437662646, "grad_norm": 2.4075605869293213, "learning_rate": 1.29765722273965e-07, "loss": 0.0693, "step": 14920 }, { "epoch": 1.7660279157795127, "grad_norm": 2.799640417098999, "learning_rate": 1.2964146131229307e-07, "loss": 0.0707, "step": 14930 }, { "epoch": 1.7672107877927608, "grad_norm": 2.5529751777648926, "learning_rate": 1.2951720035062118e-07, "loss": 0.0754, "step": 14940 }, { "epoch": 1.768393659806009, "grad_norm": 2.876215696334839, "learning_rate": 1.293929393889493e-07, "loss": 0.0775, "step": 14950 }, { "epoch": 1.7695765318192571, "grad_norm": 3.2448606491088867, "learning_rate": 1.292686784272774e-07, "loss": 0.0755, "step": 14960 }, { "epoch": 1.7707594038325052, "grad_norm": 3.125678062438965, "learning_rate": 1.2914441746560547e-07, "loss": 0.0737, "step": 14970 }, { "epoch": 1.7719422758457535, "grad_norm": 2.6102302074432373, "learning_rate": 1.2902015650393358e-07, "loss": 0.068, "step": 14980 }, { "epoch": 1.7731251478590018, "grad_norm": 2.857900857925415, "learning_rate": 1.2889589554226169e-07, "loss": 0.0765, "step": 14990 }, { "epoch": 1.7743080198722498, "grad_norm": 2.11435604095459, "learning_rate": 1.287716345805898e-07, "loss": 0.0755, "step": 15000 }, { "epoch": 1.775490891885498, "grad_norm": 2.4323062896728516, "learning_rate": 1.2864737361891787e-07, "loss": 0.0746, "step": 15010 }, { "epoch": 1.7766737638987462, "grad_norm": 2.738417148590088, "learning_rate": 1.2852311265724598e-07, "loss": 0.0776, "step": 15020 }, { "epoch": 1.7778566359119943, "grad_norm": 2.6986234188079834, "learning_rate": 1.2839885169557409e-07, "loss": 0.0747, "step": 15030 }, { "epoch": 1.7790395079252423, "grad_norm": 2.5771241188049316, "learning_rate": 1.282745907339022e-07, "loss": 0.0798, "step": 15040 }, { "epoch": 1.7802223799384906, "grad_norm": 2.264975070953369, "learning_rate": 1.2815032977223027e-07, "loss": 0.0794, "step": 15050 }, { "epoch": 1.781405251951739, "grad_norm": 2.2888023853302, "learning_rate": 1.280260688105584e-07, "loss": 0.0713, "step": 15060 }, { "epoch": 1.782588123964987, "grad_norm": 3.344294309616089, "learning_rate": 1.2790180784888648e-07, "loss": 0.0767, "step": 15070 }, { "epoch": 1.783770995978235, "grad_norm": 2.6318013668060303, "learning_rate": 1.277775468872146e-07, "loss": 0.0748, "step": 15080 }, { "epoch": 1.7849538679914834, "grad_norm": 2.235225200653076, "learning_rate": 1.2765328592554267e-07, "loss": 0.0793, "step": 15090 }, { "epoch": 1.7861367400047314, "grad_norm": 3.1042349338531494, "learning_rate": 1.275290249638708e-07, "loss": 0.0695, "step": 15100 }, { "epoch": 1.7873196120179795, "grad_norm": 2.8405821323394775, "learning_rate": 1.2740476400219888e-07, "loss": 0.0724, "step": 15110 }, { "epoch": 1.7885024840312278, "grad_norm": 3.333775281906128, "learning_rate": 1.27280503040527e-07, "loss": 0.0793, "step": 15120 }, { "epoch": 1.789685356044476, "grad_norm": 2.8468618392944336, "learning_rate": 1.271562420788551e-07, "loss": 0.0778, "step": 15130 }, { "epoch": 1.7908682280577242, "grad_norm": 2.659320831298828, "learning_rate": 1.270319811171832e-07, "loss": 0.0884, "step": 15140 }, { "epoch": 1.7920511000709722, "grad_norm": 2.34031343460083, "learning_rate": 1.2690772015551128e-07, "loss": 0.0803, "step": 15150 }, { "epoch": 1.7932339720842205, "grad_norm": 2.469794511795044, "learning_rate": 1.267834591938394e-07, "loss": 0.068, "step": 15160 }, { "epoch": 1.7944168440974686, "grad_norm": 2.642333984375, "learning_rate": 1.266591982321675e-07, "loss": 0.067, "step": 15170 }, { "epoch": 1.7955997161107167, "grad_norm": 2.2932868003845215, "learning_rate": 1.265349372704956e-07, "loss": 0.075, "step": 15180 }, { "epoch": 1.796782588123965, "grad_norm": 1.9693217277526855, "learning_rate": 1.2641067630882368e-07, "loss": 0.0726, "step": 15190 }, { "epoch": 1.7979654601372133, "grad_norm": 2.656754732131958, "learning_rate": 1.262864153471518e-07, "loss": 0.0745, "step": 15200 }, { "epoch": 1.7991483321504613, "grad_norm": 2.6651132106781006, "learning_rate": 1.261621543854799e-07, "loss": 0.0828, "step": 15210 }, { "epoch": 1.8003312041637094, "grad_norm": 2.5755698680877686, "learning_rate": 1.26037893423808e-07, "loss": 0.0739, "step": 15220 }, { "epoch": 1.8015140761769577, "grad_norm": 2.8317654132843018, "learning_rate": 1.2591363246213608e-07, "loss": 0.0706, "step": 15230 }, { "epoch": 1.802696948190206, "grad_norm": 9.941123008728027, "learning_rate": 1.257893715004642e-07, "loss": 0.083, "step": 15240 }, { "epoch": 1.8038798202034538, "grad_norm": 3.1002049446105957, "learning_rate": 1.256651105387923e-07, "loss": 0.071, "step": 15250 }, { "epoch": 1.8050626922167021, "grad_norm": 3.0838513374328613, "learning_rate": 1.2554084957712037e-07, "loss": 0.0729, "step": 15260 }, { "epoch": 1.8062455642299504, "grad_norm": 2.337399959564209, "learning_rate": 1.254165886154485e-07, "loss": 0.0712, "step": 15270 }, { "epoch": 1.8074284362431985, "grad_norm": 2.784607410430908, "learning_rate": 1.2529232765377659e-07, "loss": 0.0749, "step": 15280 }, { "epoch": 1.8086113082564466, "grad_norm": 2.1036794185638428, "learning_rate": 1.251680666921047e-07, "loss": 0.0739, "step": 15290 }, { "epoch": 1.8097941802696949, "grad_norm": 2.2835886478424072, "learning_rate": 1.2504380573043277e-07, "loss": 0.072, "step": 15300 }, { "epoch": 1.8109770522829431, "grad_norm": 4.758540153503418, "learning_rate": 1.249195447687609e-07, "loss": 0.0766, "step": 15310 }, { "epoch": 1.812159924296191, "grad_norm": 3.056859254837036, "learning_rate": 1.2479528380708899e-07, "loss": 0.073, "step": 15320 }, { "epoch": 1.8133427963094393, "grad_norm": 2.700416326522827, "learning_rate": 1.246710228454171e-07, "loss": 0.0769, "step": 15330 }, { "epoch": 1.8145256683226876, "grad_norm": 2.127960443496704, "learning_rate": 1.245467618837452e-07, "loss": 0.0717, "step": 15340 }, { "epoch": 1.8157085403359357, "grad_norm": 2.194378614425659, "learning_rate": 1.244225009220733e-07, "loss": 0.0804, "step": 15350 }, { "epoch": 1.8168914123491837, "grad_norm": 4.554673194885254, "learning_rate": 1.2429823996040138e-07, "loss": 0.0769, "step": 15360 }, { "epoch": 1.818074284362432, "grad_norm": 1.9140733480453491, "learning_rate": 1.241739789987295e-07, "loss": 0.0717, "step": 15370 }, { "epoch": 1.8192571563756803, "grad_norm": 2.9004554748535156, "learning_rate": 1.240497180370576e-07, "loss": 0.0693, "step": 15380 }, { "epoch": 1.8204400283889282, "grad_norm": 2.4746837615966797, "learning_rate": 1.239254570753857e-07, "loss": 0.0836, "step": 15390 }, { "epoch": 1.8216229004021764, "grad_norm": 2.7645483016967773, "learning_rate": 1.2380119611371378e-07, "loss": 0.0756, "step": 15400 }, { "epoch": 1.8228057724154247, "grad_norm": 2.8961758613586426, "learning_rate": 1.236769351520419e-07, "loss": 0.0814, "step": 15410 }, { "epoch": 1.8239886444286728, "grad_norm": 2.551079750061035, "learning_rate": 1.2355267419037e-07, "loss": 0.0746, "step": 15420 }, { "epoch": 1.8251715164419209, "grad_norm": 2.563857316970825, "learning_rate": 1.234284132286981e-07, "loss": 0.0588, "step": 15430 }, { "epoch": 1.8263543884551692, "grad_norm": 1.984527349472046, "learning_rate": 1.2330415226702618e-07, "loss": 0.0719, "step": 15440 }, { "epoch": 1.8275372604684175, "grad_norm": 2.696474313735962, "learning_rate": 1.231798913053543e-07, "loss": 0.0774, "step": 15450 }, { "epoch": 1.8287201324816655, "grad_norm": 3.4985930919647217, "learning_rate": 1.230556303436824e-07, "loss": 0.0724, "step": 15460 }, { "epoch": 1.8299030044949136, "grad_norm": 2.3859703540802, "learning_rate": 1.229313693820105e-07, "loss": 0.0716, "step": 15470 }, { "epoch": 1.831085876508162, "grad_norm": 2.7106072902679443, "learning_rate": 1.228071084203386e-07, "loss": 0.0707, "step": 15480 }, { "epoch": 1.83226874852141, "grad_norm": 2.7034785747528076, "learning_rate": 1.226828474586667e-07, "loss": 0.0697, "step": 15490 }, { "epoch": 1.833451620534658, "grad_norm": 2.2995712757110596, "learning_rate": 1.225585864969948e-07, "loss": 0.0759, "step": 15500 }, { "epoch": 1.8346344925479063, "grad_norm": 3.5761008262634277, "learning_rate": 1.224343255353229e-07, "loss": 0.0711, "step": 15510 }, { "epoch": 1.8358173645611546, "grad_norm": 1.8410067558288574, "learning_rate": 1.22310064573651e-07, "loss": 0.0714, "step": 15520 }, { "epoch": 1.8370002365744027, "grad_norm": 2.145275831222534, "learning_rate": 1.221858036119791e-07, "loss": 0.0756, "step": 15530 }, { "epoch": 1.8381831085876508, "grad_norm": 2.228370428085327, "learning_rate": 1.220615426503072e-07, "loss": 0.0736, "step": 15540 }, { "epoch": 1.839365980600899, "grad_norm": 2.356933832168579, "learning_rate": 1.219372816886353e-07, "loss": 0.0774, "step": 15550 }, { "epoch": 1.8405488526141471, "grad_norm": 3.513478994369507, "learning_rate": 1.218130207269634e-07, "loss": 0.0831, "step": 15560 }, { "epoch": 1.8417317246273952, "grad_norm": 2.626217842102051, "learning_rate": 1.2168875976529149e-07, "loss": 0.075, "step": 15570 }, { "epoch": 1.8429145966406435, "grad_norm": 2.7644529342651367, "learning_rate": 1.215644988036196e-07, "loss": 0.0747, "step": 15580 }, { "epoch": 1.8440974686538918, "grad_norm": 2.0679311752319336, "learning_rate": 1.214402378419477e-07, "loss": 0.0749, "step": 15590 }, { "epoch": 1.8452803406671399, "grad_norm": 3.131085157394409, "learning_rate": 1.213159768802758e-07, "loss": 0.0782, "step": 15600 }, { "epoch": 1.846463212680388, "grad_norm": 3.005885124206543, "learning_rate": 1.2119171591860389e-07, "loss": 0.0725, "step": 15610 }, { "epoch": 1.8476460846936362, "grad_norm": 3.1692936420440674, "learning_rate": 1.21067454956932e-07, "loss": 0.0697, "step": 15620 }, { "epoch": 1.8488289567068843, "grad_norm": 2.4710347652435303, "learning_rate": 1.209431939952601e-07, "loss": 0.0707, "step": 15630 }, { "epoch": 1.8500118287201324, "grad_norm": 2.253418445587158, "learning_rate": 1.208189330335882e-07, "loss": 0.0751, "step": 15640 }, { "epoch": 1.8511947007333807, "grad_norm": 2.6318488121032715, "learning_rate": 1.2069467207191628e-07, "loss": 0.0819, "step": 15650 }, { "epoch": 1.852377572746629, "grad_norm": 2.588752031326294, "learning_rate": 1.205704111102444e-07, "loss": 0.0736, "step": 15660 }, { "epoch": 1.853560444759877, "grad_norm": 2.2455599308013916, "learning_rate": 1.204461501485725e-07, "loss": 0.0756, "step": 15670 }, { "epoch": 1.854743316773125, "grad_norm": 2.68007493019104, "learning_rate": 1.203218891869006e-07, "loss": 0.0746, "step": 15680 }, { "epoch": 1.8559261887863734, "grad_norm": 2.154252052307129, "learning_rate": 1.201976282252287e-07, "loss": 0.0792, "step": 15690 }, { "epoch": 1.8571090607996215, "grad_norm": 2.3445885181427, "learning_rate": 1.200733672635568e-07, "loss": 0.0722, "step": 15700 }, { "epoch": 1.8582919328128695, "grad_norm": 2.8054234981536865, "learning_rate": 1.199491063018849e-07, "loss": 0.0674, "step": 15710 }, { "epoch": 1.8594748048261178, "grad_norm": 2.5975863933563232, "learning_rate": 1.19824845340213e-07, "loss": 0.0704, "step": 15720 }, { "epoch": 1.8606576768393661, "grad_norm": 2.4161887168884277, "learning_rate": 1.197005843785411e-07, "loss": 0.0718, "step": 15730 }, { "epoch": 1.8618405488526142, "grad_norm": 1.9876720905303955, "learning_rate": 1.195763234168692e-07, "loss": 0.0804, "step": 15740 }, { "epoch": 1.8630234208658623, "grad_norm": 4.090278625488281, "learning_rate": 1.194520624551973e-07, "loss": 0.0774, "step": 15750 }, { "epoch": 1.8642062928791105, "grad_norm": 2.7017483711242676, "learning_rate": 1.193278014935254e-07, "loss": 0.077, "step": 15760 }, { "epoch": 1.8653891648923586, "grad_norm": 3.195319652557373, "learning_rate": 1.192035405318535e-07, "loss": 0.0754, "step": 15770 }, { "epoch": 1.8665720369056067, "grad_norm": 4.165862560272217, "learning_rate": 1.190792795701816e-07, "loss": 0.076, "step": 15780 }, { "epoch": 1.867754908918855, "grad_norm": 2.1842246055603027, "learning_rate": 1.189550186085097e-07, "loss": 0.085, "step": 15790 }, { "epoch": 1.8689377809321033, "grad_norm": 2.9729127883911133, "learning_rate": 1.188307576468378e-07, "loss": 0.0778, "step": 15800 }, { "epoch": 1.8701206529453513, "grad_norm": 2.3865044116973877, "learning_rate": 1.1870649668516591e-07, "loss": 0.0835, "step": 15810 }, { "epoch": 1.8713035249585994, "grad_norm": 3.5952131748199463, "learning_rate": 1.18582235723494e-07, "loss": 0.0702, "step": 15820 }, { "epoch": 1.8724863969718477, "grad_norm": 2.5415637493133545, "learning_rate": 1.184579747618221e-07, "loss": 0.0716, "step": 15830 }, { "epoch": 1.8736692689850958, "grad_norm": 2.3129422664642334, "learning_rate": 1.1833371380015021e-07, "loss": 0.0723, "step": 15840 }, { "epoch": 1.8748521409983439, "grad_norm": 2.9883837699890137, "learning_rate": 1.1820945283847831e-07, "loss": 0.0709, "step": 15850 }, { "epoch": 1.8760350130115921, "grad_norm": 2.6807971000671387, "learning_rate": 1.180851918768064e-07, "loss": 0.0788, "step": 15860 }, { "epoch": 1.8772178850248404, "grad_norm": 2.352095127105713, "learning_rate": 1.1796093091513449e-07, "loss": 0.0713, "step": 15870 }, { "epoch": 1.8784007570380885, "grad_norm": 3.53236985206604, "learning_rate": 1.1783666995346261e-07, "loss": 0.0717, "step": 15880 }, { "epoch": 1.8795836290513366, "grad_norm": 3.207129716873169, "learning_rate": 1.177124089917907e-07, "loss": 0.0661, "step": 15890 }, { "epoch": 1.8807665010645849, "grad_norm": 3.3713278770446777, "learning_rate": 1.175881480301188e-07, "loss": 0.0805, "step": 15900 }, { "epoch": 1.881949373077833, "grad_norm": 2.155369281768799, "learning_rate": 1.1746388706844689e-07, "loss": 0.0773, "step": 15910 }, { "epoch": 1.883132245091081, "grad_norm": 2.7687926292419434, "learning_rate": 1.1733962610677501e-07, "loss": 0.0789, "step": 15920 }, { "epoch": 1.8843151171043293, "grad_norm": 2.533585548400879, "learning_rate": 1.172153651451031e-07, "loss": 0.0687, "step": 15930 }, { "epoch": 1.8854979891175776, "grad_norm": 2.1938636302948, "learning_rate": 1.170911041834312e-07, "loss": 0.0623, "step": 15940 }, { "epoch": 1.8866808611308257, "grad_norm": 3.7129251956939697, "learning_rate": 1.169668432217593e-07, "loss": 0.0751, "step": 15950 }, { "epoch": 1.8878637331440737, "grad_norm": 3.0630640983581543, "learning_rate": 1.1684258226008741e-07, "loss": 0.0709, "step": 15960 }, { "epoch": 1.889046605157322, "grad_norm": 2.6198437213897705, "learning_rate": 1.167183212984155e-07, "loss": 0.0729, "step": 15970 }, { "epoch": 1.89022947717057, "grad_norm": 3.405400514602661, "learning_rate": 1.165940603367436e-07, "loss": 0.0684, "step": 15980 }, { "epoch": 1.8914123491838182, "grad_norm": 2.269127130508423, "learning_rate": 1.164697993750717e-07, "loss": 0.0835, "step": 15990 }, { "epoch": 1.8925952211970665, "grad_norm": 2.35459566116333, "learning_rate": 1.1634553841339981e-07, "loss": 0.0679, "step": 16000 }, { "epoch": 1.8937780932103148, "grad_norm": 2.456699848175049, "learning_rate": 1.162212774517279e-07, "loss": 0.077, "step": 16010 }, { "epoch": 1.8949609652235628, "grad_norm": 2.532166004180908, "learning_rate": 1.1609701649005601e-07, "loss": 0.0646, "step": 16020 }, { "epoch": 1.896143837236811, "grad_norm": 2.484231948852539, "learning_rate": 1.159727555283841e-07, "loss": 0.0737, "step": 16030 }, { "epoch": 1.8973267092500592, "grad_norm": 2.6336803436279297, "learning_rate": 1.158484945667122e-07, "loss": 0.071, "step": 16040 }, { "epoch": 1.8985095812633073, "grad_norm": 2.4317564964294434, "learning_rate": 1.1572423360504032e-07, "loss": 0.0706, "step": 16050 }, { "epoch": 1.8996924532765553, "grad_norm": 2.858665943145752, "learning_rate": 1.1559997264336841e-07, "loss": 0.0838, "step": 16060 }, { "epoch": 1.9008753252898036, "grad_norm": 2.762697458267212, "learning_rate": 1.154757116816965e-07, "loss": 0.0755, "step": 16070 }, { "epoch": 1.902058197303052, "grad_norm": 2.0858314037323, "learning_rate": 1.153514507200246e-07, "loss": 0.0776, "step": 16080 }, { "epoch": 1.9032410693163, "grad_norm": 2.80511474609375, "learning_rate": 1.1522718975835271e-07, "loss": 0.0833, "step": 16090 }, { "epoch": 1.904423941329548, "grad_norm": 2.482387065887451, "learning_rate": 1.1510292879668081e-07, "loss": 0.0765, "step": 16100 }, { "epoch": 1.9056068133427964, "grad_norm": 2.6057660579681396, "learning_rate": 1.149786678350089e-07, "loss": 0.0828, "step": 16110 }, { "epoch": 1.9067896853560444, "grad_norm": 2.5692296028137207, "learning_rate": 1.14854406873337e-07, "loss": 0.079, "step": 16120 }, { "epoch": 1.9079725573692925, "grad_norm": 2.712559700012207, "learning_rate": 1.1473014591166511e-07, "loss": 0.0714, "step": 16130 }, { "epoch": 1.9091554293825408, "grad_norm": 2.2741641998291016, "learning_rate": 1.1460588494999321e-07, "loss": 0.0801, "step": 16140 }, { "epoch": 1.910338301395789, "grad_norm": 2.5962018966674805, "learning_rate": 1.144816239883213e-07, "loss": 0.0706, "step": 16150 }, { "epoch": 1.9115211734090372, "grad_norm": 2.2367544174194336, "learning_rate": 1.143573630266494e-07, "loss": 0.0718, "step": 16160 }, { "epoch": 1.9127040454222852, "grad_norm": 2.3612186908721924, "learning_rate": 1.1423310206497751e-07, "loss": 0.0687, "step": 16170 }, { "epoch": 1.9138869174355335, "grad_norm": 2.5002167224884033, "learning_rate": 1.141088411033056e-07, "loss": 0.0652, "step": 16180 }, { "epoch": 1.9150697894487816, "grad_norm": 2.5858945846557617, "learning_rate": 1.139845801416337e-07, "loss": 0.0889, "step": 16190 }, { "epoch": 1.9162526614620297, "grad_norm": 2.6080098152160645, "learning_rate": 1.138603191799618e-07, "loss": 0.0734, "step": 16200 }, { "epoch": 1.917435533475278, "grad_norm": 3.721014976501465, "learning_rate": 1.1373605821828991e-07, "loss": 0.0672, "step": 16210 }, { "epoch": 1.9186184054885262, "grad_norm": 2.7162938117980957, "learning_rate": 1.13611797256618e-07, "loss": 0.0716, "step": 16220 }, { "epoch": 1.9198012775017743, "grad_norm": 2.6072299480438232, "learning_rate": 1.1348753629494611e-07, "loss": 0.0654, "step": 16230 }, { "epoch": 1.9209841495150224, "grad_norm": 3.1235175132751465, "learning_rate": 1.133632753332742e-07, "loss": 0.0781, "step": 16240 }, { "epoch": 1.9221670215282707, "grad_norm": 2.9244959354400635, "learning_rate": 1.1323901437160231e-07, "loss": 0.0862, "step": 16250 }, { "epoch": 1.923349893541519, "grad_norm": 2.7050564289093018, "learning_rate": 1.1311475340993042e-07, "loss": 0.0673, "step": 16260 }, { "epoch": 1.9245327655547668, "grad_norm": 3.9330360889434814, "learning_rate": 1.1299049244825851e-07, "loss": 0.0661, "step": 16270 }, { "epoch": 1.9257156375680151, "grad_norm": 2.306513786315918, "learning_rate": 1.128662314865866e-07, "loss": 0.073, "step": 16280 }, { "epoch": 1.9268985095812634, "grad_norm": 2.48642635345459, "learning_rate": 1.1274197052491471e-07, "loss": 0.0736, "step": 16290 }, { "epoch": 1.9280813815945115, "grad_norm": 2.7289185523986816, "learning_rate": 1.1261770956324282e-07, "loss": 0.0836, "step": 16300 }, { "epoch": 1.9292642536077595, "grad_norm": 2.1320013999938965, "learning_rate": 1.1249344860157091e-07, "loss": 0.0704, "step": 16310 }, { "epoch": 1.9304471256210078, "grad_norm": 3.043761968612671, "learning_rate": 1.12369187639899e-07, "loss": 0.0756, "step": 16320 }, { "epoch": 1.9316299976342561, "grad_norm": 3.174391508102417, "learning_rate": 1.1224492667822712e-07, "loss": 0.0747, "step": 16330 }, { "epoch": 1.932812869647504, "grad_norm": 2.7276394367218018, "learning_rate": 1.1212066571655522e-07, "loss": 0.0771, "step": 16340 }, { "epoch": 1.9339957416607523, "grad_norm": 2.3127124309539795, "learning_rate": 1.1199640475488331e-07, "loss": 0.0689, "step": 16350 }, { "epoch": 1.9351786136740006, "grad_norm": 2.3936338424682617, "learning_rate": 1.118721437932114e-07, "loss": 0.0732, "step": 16360 }, { "epoch": 1.9363614856872486, "grad_norm": 2.5432140827178955, "learning_rate": 1.1174788283153952e-07, "loss": 0.0726, "step": 16370 }, { "epoch": 1.9375443577004967, "grad_norm": 3.1328258514404297, "learning_rate": 1.1162362186986761e-07, "loss": 0.0738, "step": 16380 }, { "epoch": 1.938727229713745, "grad_norm": 2.036482334136963, "learning_rate": 1.1149936090819571e-07, "loss": 0.0709, "step": 16390 }, { "epoch": 1.9399101017269933, "grad_norm": 2.1682028770446777, "learning_rate": 1.113750999465238e-07, "loss": 0.0831, "step": 16400 }, { "epoch": 1.9410929737402411, "grad_norm": 2.8902812004089355, "learning_rate": 1.1125083898485192e-07, "loss": 0.0783, "step": 16410 }, { "epoch": 1.9422758457534894, "grad_norm": 3.0247395038604736, "learning_rate": 1.1112657802318001e-07, "loss": 0.0786, "step": 16420 }, { "epoch": 1.9434587177667377, "grad_norm": 3.031216621398926, "learning_rate": 1.1100231706150811e-07, "loss": 0.0762, "step": 16430 }, { "epoch": 1.9446415897799858, "grad_norm": 2.005890369415283, "learning_rate": 1.1087805609983621e-07, "loss": 0.0806, "step": 16440 }, { "epoch": 1.9458244617932339, "grad_norm": 2.7851336002349854, "learning_rate": 1.107537951381643e-07, "loss": 0.0743, "step": 16450 }, { "epoch": 1.9470073338064822, "grad_norm": 3.3901100158691406, "learning_rate": 1.1062953417649241e-07, "loss": 0.0716, "step": 16460 }, { "epoch": 1.9481902058197305, "grad_norm": 2.7525343894958496, "learning_rate": 1.1050527321482052e-07, "loss": 0.0649, "step": 16470 }, { "epoch": 1.9493730778329783, "grad_norm": 3.3134188652038574, "learning_rate": 1.1038101225314861e-07, "loss": 0.0724, "step": 16480 }, { "epoch": 1.9505559498462266, "grad_norm": 2.20290207862854, "learning_rate": 1.102567512914767e-07, "loss": 0.0769, "step": 16490 }, { "epoch": 1.951738821859475, "grad_norm": 1.8754603862762451, "learning_rate": 1.1013249032980481e-07, "loss": 0.0808, "step": 16500 }, { "epoch": 1.952921693872723, "grad_norm": 2.433760404586792, "learning_rate": 1.1000822936813292e-07, "loss": 0.062, "step": 16510 }, { "epoch": 1.954104565885971, "grad_norm": 2.3277528285980225, "learning_rate": 1.0988396840646101e-07, "loss": 0.0732, "step": 16520 }, { "epoch": 1.9552874378992193, "grad_norm": 2.291161060333252, "learning_rate": 1.097597074447891e-07, "loss": 0.0796, "step": 16530 }, { "epoch": 1.9564703099124676, "grad_norm": 2.08120059967041, "learning_rate": 1.0963544648311722e-07, "loss": 0.0798, "step": 16540 }, { "epoch": 1.9576531819257157, "grad_norm": 2.300057888031006, "learning_rate": 1.0951118552144532e-07, "loss": 0.0691, "step": 16550 }, { "epoch": 1.9588360539389638, "grad_norm": 2.2748515605926514, "learning_rate": 1.0938692455977341e-07, "loss": 0.0848, "step": 16560 }, { "epoch": 1.960018925952212, "grad_norm": 2.731955051422119, "learning_rate": 1.092626635981015e-07, "loss": 0.0768, "step": 16570 }, { "epoch": 1.9612017979654601, "grad_norm": 3.3647286891937256, "learning_rate": 1.0913840263642962e-07, "loss": 0.0726, "step": 16580 }, { "epoch": 1.9623846699787082, "grad_norm": 1.9101048707962036, "learning_rate": 1.0901414167475772e-07, "loss": 0.0787, "step": 16590 }, { "epoch": 1.9635675419919565, "grad_norm": 2.8876430988311768, "learning_rate": 1.0888988071308581e-07, "loss": 0.0779, "step": 16600 }, { "epoch": 1.9647504140052048, "grad_norm": 2.296199321746826, "learning_rate": 1.087656197514139e-07, "loss": 0.0721, "step": 16610 }, { "epoch": 1.9659332860184529, "grad_norm": 2.099191427230835, "learning_rate": 1.0864135878974202e-07, "loss": 0.073, "step": 16620 }, { "epoch": 1.967116158031701, "grad_norm": 3.850623369216919, "learning_rate": 1.0851709782807012e-07, "loss": 0.076, "step": 16630 }, { "epoch": 1.9682990300449492, "grad_norm": 2.3619115352630615, "learning_rate": 1.0839283686639821e-07, "loss": 0.0725, "step": 16640 }, { "epoch": 1.9694819020581973, "grad_norm": 2.8478941917419434, "learning_rate": 1.0826857590472632e-07, "loss": 0.0788, "step": 16650 }, { "epoch": 1.9706647740714454, "grad_norm": 2.29879093170166, "learning_rate": 1.0814431494305442e-07, "loss": 0.0756, "step": 16660 }, { "epoch": 1.9718476460846937, "grad_norm": 2.7090115547180176, "learning_rate": 1.0802005398138251e-07, "loss": 0.0727, "step": 16670 }, { "epoch": 1.973030518097942, "grad_norm": 2.783052921295166, "learning_rate": 1.0789579301971062e-07, "loss": 0.0758, "step": 16680 }, { "epoch": 1.97421339011119, "grad_norm": 2.7916996479034424, "learning_rate": 1.0777153205803871e-07, "loss": 0.0775, "step": 16690 }, { "epoch": 1.975396262124438, "grad_norm": 2.222384452819824, "learning_rate": 1.0764727109636682e-07, "loss": 0.076, "step": 16700 }, { "epoch": 1.9765791341376864, "grad_norm": 2.5768797397613525, "learning_rate": 1.0752301013469491e-07, "loss": 0.0724, "step": 16710 }, { "epoch": 1.9777620061509344, "grad_norm": 2.631540298461914, "learning_rate": 1.0739874917302302e-07, "loss": 0.0735, "step": 16720 }, { "epoch": 1.9789448781641825, "grad_norm": 2.633510112762451, "learning_rate": 1.0727448821135111e-07, "loss": 0.0736, "step": 16730 }, { "epoch": 1.9801277501774308, "grad_norm": 2.9926719665527344, "learning_rate": 1.0715022724967922e-07, "loss": 0.079, "step": 16740 }, { "epoch": 1.981310622190679, "grad_norm": 2.494245767593384, "learning_rate": 1.0702596628800733e-07, "loss": 0.0771, "step": 16750 }, { "epoch": 1.9824934942039272, "grad_norm": 2.6124095916748047, "learning_rate": 1.0690170532633542e-07, "loss": 0.0697, "step": 16760 }, { "epoch": 1.9836763662171752, "grad_norm": 2.507805347442627, "learning_rate": 1.0677744436466351e-07, "loss": 0.0732, "step": 16770 }, { "epoch": 1.9848592382304235, "grad_norm": 2.7860844135284424, "learning_rate": 1.0665318340299162e-07, "loss": 0.0726, "step": 16780 }, { "epoch": 1.9860421102436716, "grad_norm": 2.7887918949127197, "learning_rate": 1.0652892244131973e-07, "loss": 0.0822, "step": 16790 }, { "epoch": 1.9872249822569197, "grad_norm": 2.316514492034912, "learning_rate": 1.0640466147964782e-07, "loss": 0.0786, "step": 16800 }, { "epoch": 1.988407854270168, "grad_norm": 2.9376015663146973, "learning_rate": 1.0628040051797591e-07, "loss": 0.0762, "step": 16810 }, { "epoch": 1.9895907262834163, "grad_norm": 3.0005135536193848, "learning_rate": 1.06156139556304e-07, "loss": 0.0655, "step": 16820 }, { "epoch": 1.9907735982966643, "grad_norm": 2.4962329864501953, "learning_rate": 1.0603187859463212e-07, "loss": 0.0651, "step": 16830 }, { "epoch": 1.9919564703099124, "grad_norm": 2.6067328453063965, "learning_rate": 1.0590761763296022e-07, "loss": 0.0782, "step": 16840 }, { "epoch": 1.9931393423231607, "grad_norm": 2.554158926010132, "learning_rate": 1.0578335667128831e-07, "loss": 0.0739, "step": 16850 }, { "epoch": 1.9943222143364088, "grad_norm": 2.576075553894043, "learning_rate": 1.0565909570961642e-07, "loss": 0.0773, "step": 16860 }, { "epoch": 1.9955050863496568, "grad_norm": 2.8028252124786377, "learning_rate": 1.0553483474794452e-07, "loss": 0.0782, "step": 16870 }, { "epoch": 1.9966879583629051, "grad_norm": 1.8782522678375244, "learning_rate": 1.0541057378627262e-07, "loss": 0.0762, "step": 16880 }, { "epoch": 1.9978708303761534, "grad_norm": 2.524460792541504, "learning_rate": 1.0528631282460072e-07, "loss": 0.072, "step": 16890 }, { "epoch": 1.9990537023894015, "grad_norm": 2.636399984359741, "learning_rate": 1.0516205186292882e-07, "loss": 0.0745, "step": 16900 }, { "epoch": 2.0, "eval_accuracy": 0.6956449412782381, "eval_flagged/accuracy": 0.853162324915993, "eval_flagged/f1": 0.8608145823806745, "eval_flagged/precision": 0.9108382274426055, "eval_flagged/recall": 0.8159995216884398, "eval_loss": 0.07765912264585495, "eval_macro_f1": 0.6294576487775332, "eval_macro_precision": 0.7647206724526951, "eval_macro_recall": 0.5679825133242721, "eval_micro_f1": 0.7502572931795666, "eval_micro_precision": 0.8171171171171171, "eval_micro_recall": 0.6935113892988601, "eval_runtime": 86.2973, "eval_samples_per_second": 696.592, "eval_steps_per_second": 5.446, "step": 16908 }, { "epoch": 2.0002365744026496, "grad_norm": 2.0878608226776123, "learning_rate": 1.0503779090125692e-07, "loss": 0.0752, "step": 16910 }, { "epoch": 2.001419446415898, "grad_norm": 3.018571376800537, "learning_rate": 1.0491352993958502e-07, "loss": 0.0704, "step": 16920 }, { "epoch": 2.002602318429146, "grad_norm": 2.8486640453338623, "learning_rate": 1.0478926897791312e-07, "loss": 0.071, "step": 16930 }, { "epoch": 2.003785190442394, "grad_norm": 3.232513666152954, "learning_rate": 1.0466500801624122e-07, "loss": 0.0712, "step": 16940 }, { "epoch": 2.0049680624556423, "grad_norm": 4.2512526512146, "learning_rate": 1.0454074705456932e-07, "loss": 0.0757, "step": 16950 }, { "epoch": 2.0061509344688906, "grad_norm": 2.5110599994659424, "learning_rate": 1.0441648609289743e-07, "loss": 0.0715, "step": 16960 }, { "epoch": 2.0073338064821384, "grad_norm": 3.0045862197875977, "learning_rate": 1.0429222513122552e-07, "loss": 0.0749, "step": 16970 }, { "epoch": 2.0085166784953867, "grad_norm": 2.8320610523223877, "learning_rate": 1.0416796416955361e-07, "loss": 0.0703, "step": 16980 }, { "epoch": 2.009699550508635, "grad_norm": 2.5527291297912598, "learning_rate": 1.0404370320788172e-07, "loss": 0.0716, "step": 16990 }, { "epoch": 2.0108824225218833, "grad_norm": 2.867929458618164, "learning_rate": 1.0391944224620983e-07, "loss": 0.0698, "step": 17000 }, { "epoch": 2.012065294535131, "grad_norm": 2.9480843544006348, "learning_rate": 1.0379518128453792e-07, "loss": 0.0708, "step": 17010 }, { "epoch": 2.0132481665483795, "grad_norm": 2.798907518386841, "learning_rate": 1.0367092032286601e-07, "loss": 0.0669, "step": 17020 }, { "epoch": 2.0144310385616278, "grad_norm": 3.631204605102539, "learning_rate": 1.0354665936119413e-07, "loss": 0.0793, "step": 17030 }, { "epoch": 2.0156139105748756, "grad_norm": 2.3505771160125732, "learning_rate": 1.0342239839952223e-07, "loss": 0.0622, "step": 17040 }, { "epoch": 2.016796782588124, "grad_norm": 3.0960798263549805, "learning_rate": 1.0329813743785032e-07, "loss": 0.0709, "step": 17050 }, { "epoch": 2.017979654601372, "grad_norm": 3.2478065490722656, "learning_rate": 1.0317387647617841e-07, "loss": 0.0687, "step": 17060 }, { "epoch": 2.0191625266146205, "grad_norm": 2.580364465713501, "learning_rate": 1.0304961551450653e-07, "loss": 0.0617, "step": 17070 }, { "epoch": 2.0203453986278683, "grad_norm": 3.0564444065093994, "learning_rate": 1.0292535455283463e-07, "loss": 0.0708, "step": 17080 }, { "epoch": 2.0215282706411166, "grad_norm": 3.0616064071655273, "learning_rate": 1.0280109359116272e-07, "loss": 0.0628, "step": 17090 }, { "epoch": 2.022711142654365, "grad_norm": 3.306182384490967, "learning_rate": 1.0267683262949082e-07, "loss": 0.0668, "step": 17100 }, { "epoch": 2.0238940146676128, "grad_norm": 3.6199254989624023, "learning_rate": 1.0255257166781893e-07, "loss": 0.0719, "step": 17110 }, { "epoch": 2.025076886680861, "grad_norm": 2.796367645263672, "learning_rate": 1.0242831070614702e-07, "loss": 0.0671, "step": 17120 }, { "epoch": 2.0262597586941093, "grad_norm": 2.5239336490631104, "learning_rate": 1.0230404974447512e-07, "loss": 0.0658, "step": 17130 }, { "epoch": 2.0274426307073576, "grad_norm": 4.072277069091797, "learning_rate": 1.0217978878280322e-07, "loss": 0.0758, "step": 17140 }, { "epoch": 2.0286255027206055, "grad_norm": 3.4722890853881836, "learning_rate": 1.0205552782113133e-07, "loss": 0.071, "step": 17150 }, { "epoch": 2.029808374733854, "grad_norm": 2.9359891414642334, "learning_rate": 1.0193126685945942e-07, "loss": 0.0678, "step": 17160 }, { "epoch": 2.030991246747102, "grad_norm": 3.227806329727173, "learning_rate": 1.0180700589778753e-07, "loss": 0.0674, "step": 17170 }, { "epoch": 2.03217411876035, "grad_norm": 2.14829421043396, "learning_rate": 1.0168274493611562e-07, "loss": 0.0602, "step": 17180 }, { "epoch": 2.033356990773598, "grad_norm": 3.6139075756073, "learning_rate": 1.0155848397444373e-07, "loss": 0.0637, "step": 17190 }, { "epoch": 2.0345398627868465, "grad_norm": 3.230160713195801, "learning_rate": 1.0143422301277182e-07, "loss": 0.0697, "step": 17200 }, { "epoch": 2.035722734800095, "grad_norm": 3.7577931880950928, "learning_rate": 1.0130996205109993e-07, "loss": 0.0656, "step": 17210 }, { "epoch": 2.0369056068133427, "grad_norm": 3.2689623832702637, "learning_rate": 1.0118570108942802e-07, "loss": 0.0723, "step": 17220 }, { "epoch": 2.038088478826591, "grad_norm": 2.9400579929351807, "learning_rate": 1.0106144012775612e-07, "loss": 0.0724, "step": 17230 }, { "epoch": 2.0392713508398392, "grad_norm": 3.7574002742767334, "learning_rate": 1.0093717916608424e-07, "loss": 0.0765, "step": 17240 }, { "epoch": 2.040454222853087, "grad_norm": 2.809032678604126, "learning_rate": 1.0081291820441233e-07, "loss": 0.0699, "step": 17250 }, { "epoch": 2.0416370948663354, "grad_norm": 2.7370777130126953, "learning_rate": 1.0068865724274042e-07, "loss": 0.0601, "step": 17260 }, { "epoch": 2.0428199668795837, "grad_norm": 3.083723306655884, "learning_rate": 1.0056439628106851e-07, "loss": 0.0686, "step": 17270 }, { "epoch": 2.044002838892832, "grad_norm": 3.4794044494628906, "learning_rate": 1.0044013531939663e-07, "loss": 0.0763, "step": 17280 }, { "epoch": 2.04518571090608, "grad_norm": 2.823254108428955, "learning_rate": 1.0031587435772473e-07, "loss": 0.0699, "step": 17290 }, { "epoch": 2.046368582919328, "grad_norm": 3.4908294677734375, "learning_rate": 1.0019161339605282e-07, "loss": 0.0713, "step": 17300 }, { "epoch": 2.0475514549325764, "grad_norm": 4.061230182647705, "learning_rate": 1.0006735243438093e-07, "loss": 0.071, "step": 17310 }, { "epoch": 2.0487343269458242, "grad_norm": 2.9869067668914795, "learning_rate": 9.994309147270903e-08, "loss": 0.0677, "step": 17320 }, { "epoch": 2.0499171989590725, "grad_norm": 3.671929359436035, "learning_rate": 9.981883051103713e-08, "loss": 0.0793, "step": 17330 }, { "epoch": 2.051100070972321, "grad_norm": 3.499880313873291, "learning_rate": 9.969456954936522e-08, "loss": 0.0644, "step": 17340 }, { "epoch": 2.052282942985569, "grad_norm": 3.619166612625122, "learning_rate": 9.957030858769333e-08, "loss": 0.0652, "step": 17350 }, { "epoch": 2.053465814998817, "grad_norm": 3.160071611404419, "learning_rate": 9.944604762602143e-08, "loss": 0.0704, "step": 17360 }, { "epoch": 2.0546486870120653, "grad_norm": 3.7535431385040283, "learning_rate": 9.932178666434953e-08, "loss": 0.0702, "step": 17370 }, { "epoch": 2.0558315590253136, "grad_norm": 3.615499258041382, "learning_rate": 9.919752570267763e-08, "loss": 0.0725, "step": 17380 }, { "epoch": 2.057014431038562, "grad_norm": 3.3036065101623535, "learning_rate": 9.907326474100573e-08, "loss": 0.0672, "step": 17390 }, { "epoch": 2.0581973030518097, "grad_norm": 3.046010732650757, "learning_rate": 9.894900377933383e-08, "loss": 0.0635, "step": 17400 }, { "epoch": 2.059380175065058, "grad_norm": 3.3736839294433594, "learning_rate": 9.882474281766192e-08, "loss": 0.07, "step": 17410 }, { "epoch": 2.0605630470783063, "grad_norm": 3.761300563812256, "learning_rate": 9.870048185599003e-08, "loss": 0.0726, "step": 17420 }, { "epoch": 2.061745919091554, "grad_norm": 2.569359064102173, "learning_rate": 9.857622089431812e-08, "loss": 0.0738, "step": 17430 }, { "epoch": 2.0629287911048024, "grad_norm": 2.959388017654419, "learning_rate": 9.845195993264623e-08, "loss": 0.0694, "step": 17440 }, { "epoch": 2.0641116631180507, "grad_norm": 2.8931570053100586, "learning_rate": 9.832769897097434e-08, "loss": 0.0659, "step": 17450 }, { "epoch": 2.065294535131299, "grad_norm": 2.9176535606384277, "learning_rate": 9.820343800930243e-08, "loss": 0.0674, "step": 17460 }, { "epoch": 2.066477407144547, "grad_norm": 3.416747570037842, "learning_rate": 9.807917704763052e-08, "loss": 0.0692, "step": 17470 }, { "epoch": 2.067660279157795, "grad_norm": 4.427723407745361, "learning_rate": 9.795491608595864e-08, "loss": 0.0715, "step": 17480 }, { "epoch": 2.0688431511710434, "grad_norm": 2.3530871868133545, "learning_rate": 9.783065512428674e-08, "loss": 0.0682, "step": 17490 }, { "epoch": 2.0700260231842913, "grad_norm": 3.5107626914978027, "learning_rate": 9.770639416261483e-08, "loss": 0.0694, "step": 17500 }, { "epoch": 2.0712088951975396, "grad_norm": 3.4750397205352783, "learning_rate": 9.758213320094292e-08, "loss": 0.0764, "step": 17510 }, { "epoch": 2.072391767210788, "grad_norm": 2.4601211547851562, "learning_rate": 9.745787223927104e-08, "loss": 0.0639, "step": 17520 }, { "epoch": 2.073574639224036, "grad_norm": 2.868745803833008, "learning_rate": 9.733361127759914e-08, "loss": 0.0665, "step": 17530 }, { "epoch": 2.074757511237284, "grad_norm": 2.733719825744629, "learning_rate": 9.720935031592723e-08, "loss": 0.0616, "step": 17540 }, { "epoch": 2.0759403832505323, "grad_norm": 3.0722837448120117, "learning_rate": 9.708508935425532e-08, "loss": 0.0664, "step": 17550 }, { "epoch": 2.0771232552637806, "grad_norm": 4.797112941741943, "learning_rate": 9.696082839258344e-08, "loss": 0.0661, "step": 17560 }, { "epoch": 2.0783061272770285, "grad_norm": 3.875338077545166, "learning_rate": 9.683656743091153e-08, "loss": 0.0622, "step": 17570 }, { "epoch": 2.0794889992902768, "grad_norm": 3.0145435333251953, "learning_rate": 9.671230646923963e-08, "loss": 0.0749, "step": 17580 }, { "epoch": 2.080671871303525, "grad_norm": 3.0776917934417725, "learning_rate": 9.658804550756773e-08, "loss": 0.0745, "step": 17590 }, { "epoch": 2.0818547433167733, "grad_norm": 3.061025619506836, "learning_rate": 9.646378454589583e-08, "loss": 0.0629, "step": 17600 }, { "epoch": 2.083037615330021, "grad_norm": 3.3206334114074707, "learning_rate": 9.633952358422393e-08, "loss": 0.0691, "step": 17610 }, { "epoch": 2.0842204873432695, "grad_norm": 2.71479868888855, "learning_rate": 9.621526262255203e-08, "loss": 0.07, "step": 17620 }, { "epoch": 2.0854033593565178, "grad_norm": 2.6939163208007812, "learning_rate": 9.609100166088013e-08, "loss": 0.0744, "step": 17630 }, { "epoch": 2.0865862313697656, "grad_norm": 2.513003349304199, "learning_rate": 9.596674069920823e-08, "loss": 0.0753, "step": 17640 }, { "epoch": 2.087769103383014, "grad_norm": 3.194693088531494, "learning_rate": 9.584247973753633e-08, "loss": 0.0662, "step": 17650 }, { "epoch": 2.088951975396262, "grad_norm": 3.461238384246826, "learning_rate": 9.571821877586444e-08, "loss": 0.0636, "step": 17660 }, { "epoch": 2.0901348474095105, "grad_norm": 2.5343427658081055, "learning_rate": 9.559395781419253e-08, "loss": 0.0661, "step": 17670 }, { "epoch": 2.0913177194227583, "grad_norm": 3.3090105056762695, "learning_rate": 9.546969685252063e-08, "loss": 0.0715, "step": 17680 }, { "epoch": 2.0925005914360066, "grad_norm": 3.805300712585449, "learning_rate": 9.534543589084874e-08, "loss": 0.0714, "step": 17690 }, { "epoch": 2.093683463449255, "grad_norm": 3.921548843383789, "learning_rate": 9.522117492917684e-08, "loss": 0.0676, "step": 17700 }, { "epoch": 2.094866335462503, "grad_norm": 3.5764617919921875, "learning_rate": 9.509691396750493e-08, "loss": 0.0699, "step": 17710 }, { "epoch": 2.096049207475751, "grad_norm": 2.381481885910034, "learning_rate": 9.497265300583302e-08, "loss": 0.0607, "step": 17720 }, { "epoch": 2.0972320794889994, "grad_norm": 2.6727700233459473, "learning_rate": 9.484839204416114e-08, "loss": 0.0748, "step": 17730 }, { "epoch": 2.0984149515022477, "grad_norm": 3.6301305294036865, "learning_rate": 9.472413108248924e-08, "loss": 0.0778, "step": 17740 }, { "epoch": 2.0995978235154955, "grad_norm": 3.570549964904785, "learning_rate": 9.459987012081733e-08, "loss": 0.0674, "step": 17750 }, { "epoch": 2.100780695528744, "grad_norm": 3.5768401622772217, "learning_rate": 9.447560915914542e-08, "loss": 0.0699, "step": 17760 }, { "epoch": 2.101963567541992, "grad_norm": 2.454476833343506, "learning_rate": 9.435134819747354e-08, "loss": 0.066, "step": 17770 }, { "epoch": 2.10314643955524, "grad_norm": 3.0337021350860596, "learning_rate": 9.422708723580164e-08, "loss": 0.0714, "step": 17780 }, { "epoch": 2.1043293115684882, "grad_norm": 3.2996010780334473, "learning_rate": 9.410282627412973e-08, "loss": 0.0712, "step": 17790 }, { "epoch": 2.1055121835817365, "grad_norm": 3.025628089904785, "learning_rate": 9.397856531245784e-08, "loss": 0.0681, "step": 17800 }, { "epoch": 2.106695055594985, "grad_norm": 3.275552272796631, "learning_rate": 9.385430435078594e-08, "loss": 0.0682, "step": 17810 }, { "epoch": 2.1078779276082327, "grad_norm": 2.804396152496338, "learning_rate": 9.373004338911404e-08, "loss": 0.0698, "step": 17820 }, { "epoch": 2.109060799621481, "grad_norm": 3.186006784439087, "learning_rate": 9.360578242744213e-08, "loss": 0.0795, "step": 17830 }, { "epoch": 2.1102436716347293, "grad_norm": 3.2354249954223633, "learning_rate": 9.348152146577023e-08, "loss": 0.0707, "step": 17840 }, { "epoch": 2.111426543647977, "grad_norm": 2.376523494720459, "learning_rate": 9.335726050409834e-08, "loss": 0.0718, "step": 17850 }, { "epoch": 2.1126094156612254, "grad_norm": 4.070720672607422, "learning_rate": 9.323299954242643e-08, "loss": 0.0626, "step": 17860 }, { "epoch": 2.1137922876744737, "grad_norm": 2.1912505626678467, "learning_rate": 9.310873858075454e-08, "loss": 0.067, "step": 17870 }, { "epoch": 2.114975159687722, "grad_norm": 2.9776341915130615, "learning_rate": 9.298447761908263e-08, "loss": 0.0707, "step": 17880 }, { "epoch": 2.11615803170097, "grad_norm": 3.1587605476379395, "learning_rate": 9.286021665741074e-08, "loss": 0.0701, "step": 17890 }, { "epoch": 2.117340903714218, "grad_norm": 3.3989832401275635, "learning_rate": 9.273595569573885e-08, "loss": 0.0649, "step": 17900 }, { "epoch": 2.1185237757274664, "grad_norm": 3.5216143131256104, "learning_rate": 9.261169473406694e-08, "loss": 0.0708, "step": 17910 }, { "epoch": 2.1197066477407143, "grad_norm": 3.5039315223693848, "learning_rate": 9.248743377239503e-08, "loss": 0.0742, "step": 17920 }, { "epoch": 2.1208895197539626, "grad_norm": 3.0199732780456543, "learning_rate": 9.236317281072314e-08, "loss": 0.0695, "step": 17930 }, { "epoch": 2.122072391767211, "grad_norm": 4.005902290344238, "learning_rate": 9.223891184905125e-08, "loss": 0.0669, "step": 17940 }, { "epoch": 2.123255263780459, "grad_norm": 4.3672051429748535, "learning_rate": 9.211465088737934e-08, "loss": 0.0725, "step": 17950 }, { "epoch": 2.124438135793707, "grad_norm": 2.454200267791748, "learning_rate": 9.199038992570743e-08, "loss": 0.0669, "step": 17960 }, { "epoch": 2.1256210078069553, "grad_norm": 2.3986425399780273, "learning_rate": 9.186612896403555e-08, "loss": 0.0619, "step": 17970 }, { "epoch": 2.1268038798202036, "grad_norm": 2.3777952194213867, "learning_rate": 9.174186800236364e-08, "loss": 0.0672, "step": 17980 }, { "epoch": 2.1279867518334514, "grad_norm": 2.9773848056793213, "learning_rate": 9.161760704069174e-08, "loss": 0.0705, "step": 17990 }, { "epoch": 2.1291696238466997, "grad_norm": 3.4781293869018555, "learning_rate": 9.149334607901983e-08, "loss": 0.0696, "step": 18000 }, { "epoch": 2.130352495859948, "grad_norm": 2.515350818634033, "learning_rate": 9.136908511734794e-08, "loss": 0.0647, "step": 18010 }, { "epoch": 2.1315353678731963, "grad_norm": 3.4378321170806885, "learning_rate": 9.124482415567604e-08, "loss": 0.0728, "step": 18020 }, { "epoch": 2.132718239886444, "grad_norm": 2.412824869155884, "learning_rate": 9.112056319400414e-08, "loss": 0.0661, "step": 18030 }, { "epoch": 2.1339011118996924, "grad_norm": 3.5548126697540283, "learning_rate": 9.099630223233223e-08, "loss": 0.0724, "step": 18040 }, { "epoch": 2.1350839839129407, "grad_norm": 3.702521800994873, "learning_rate": 9.087204127066034e-08, "loss": 0.0714, "step": 18050 }, { "epoch": 2.1362668559261886, "grad_norm": 5.137021541595459, "learning_rate": 9.074778030898844e-08, "loss": 0.0734, "step": 18060 }, { "epoch": 2.137449727939437, "grad_norm": 3.5286521911621094, "learning_rate": 9.062351934731654e-08, "loss": 0.0774, "step": 18070 }, { "epoch": 2.138632599952685, "grad_norm": 2.7364888191223145, "learning_rate": 9.049925838564464e-08, "loss": 0.0784, "step": 18080 }, { "epoch": 2.1398154719659335, "grad_norm": 3.3668947219848633, "learning_rate": 9.037499742397274e-08, "loss": 0.0683, "step": 18090 }, { "epoch": 2.1409983439791813, "grad_norm": 4.052450180053711, "learning_rate": 9.025073646230084e-08, "loss": 0.0731, "step": 18100 }, { "epoch": 2.1421812159924296, "grad_norm": 3.612034559249878, "learning_rate": 9.012647550062895e-08, "loss": 0.0707, "step": 18110 }, { "epoch": 2.143364088005678, "grad_norm": 2.8719542026519775, "learning_rate": 9.000221453895704e-08, "loss": 0.0692, "step": 18120 }, { "epoch": 2.1445469600189258, "grad_norm": 3.0098962783813477, "learning_rate": 8.987795357728513e-08, "loss": 0.0786, "step": 18130 }, { "epoch": 2.145729832032174, "grad_norm": 3.1535134315490723, "learning_rate": 8.975369261561324e-08, "loss": 0.0641, "step": 18140 }, { "epoch": 2.1469127040454223, "grad_norm": 3.838127374649048, "learning_rate": 8.962943165394135e-08, "loss": 0.0735, "step": 18150 }, { "epoch": 2.1480955760586706, "grad_norm": 3.329160690307617, "learning_rate": 8.950517069226944e-08, "loss": 0.0707, "step": 18160 }, { "epoch": 2.1492784480719185, "grad_norm": 3.2489681243896484, "learning_rate": 8.938090973059753e-08, "loss": 0.0694, "step": 18170 }, { "epoch": 2.1504613200851668, "grad_norm": 3.5367112159729004, "learning_rate": 8.925664876892565e-08, "loss": 0.0739, "step": 18180 }, { "epoch": 2.151644192098415, "grad_norm": 3.335631847381592, "learning_rate": 8.913238780725375e-08, "loss": 0.0721, "step": 18190 }, { "epoch": 2.152827064111663, "grad_norm": 3.6521618366241455, "learning_rate": 8.900812684558184e-08, "loss": 0.0723, "step": 18200 }, { "epoch": 2.154009936124911, "grad_norm": 3.094815969467163, "learning_rate": 8.888386588390993e-08, "loss": 0.0629, "step": 18210 }, { "epoch": 2.1551928081381595, "grad_norm": 3.406586170196533, "learning_rate": 8.875960492223805e-08, "loss": 0.0638, "step": 18220 }, { "epoch": 2.156375680151408, "grad_norm": 2.437417984008789, "learning_rate": 8.863534396056615e-08, "loss": 0.065, "step": 18230 }, { "epoch": 2.1575585521646556, "grad_norm": 3.3551313877105713, "learning_rate": 8.851108299889424e-08, "loss": 0.0732, "step": 18240 }, { "epoch": 2.158741424177904, "grad_norm": 2.430096387863159, "learning_rate": 8.838682203722233e-08, "loss": 0.0669, "step": 18250 }, { "epoch": 2.1599242961911522, "grad_norm": 3.75054669380188, "learning_rate": 8.826256107555045e-08, "loss": 0.0653, "step": 18260 }, { "epoch": 2.1611071682044, "grad_norm": 4.0450968742370605, "learning_rate": 8.813830011387855e-08, "loss": 0.069, "step": 18270 }, { "epoch": 2.1622900402176484, "grad_norm": 3.468480110168457, "learning_rate": 8.801403915220664e-08, "loss": 0.0675, "step": 18280 }, { "epoch": 2.1634729122308967, "grad_norm": 2.8435239791870117, "learning_rate": 8.788977819053474e-08, "loss": 0.074, "step": 18290 }, { "epoch": 2.164655784244145, "grad_norm": 3.4733200073242188, "learning_rate": 8.776551722886285e-08, "loss": 0.061, "step": 18300 }, { "epoch": 2.165838656257393, "grad_norm": 3.4718265533447266, "learning_rate": 8.764125626719094e-08, "loss": 0.0703, "step": 18310 }, { "epoch": 2.167021528270641, "grad_norm": 35.06625747680664, "learning_rate": 8.751699530551905e-08, "loss": 0.0712, "step": 18320 }, { "epoch": 2.1682044002838894, "grad_norm": 2.596001625061035, "learning_rate": 8.739273434384714e-08, "loss": 0.0633, "step": 18330 }, { "epoch": 2.1693872722971372, "grad_norm": 2.468822717666626, "learning_rate": 8.726847338217525e-08, "loss": 0.0672, "step": 18340 }, { "epoch": 2.1705701443103855, "grad_norm": 3.339338779449463, "learning_rate": 8.714421242050334e-08, "loss": 0.0664, "step": 18350 }, { "epoch": 2.171753016323634, "grad_norm": 3.6640405654907227, "learning_rate": 8.701995145883145e-08, "loss": 0.0816, "step": 18360 }, { "epoch": 2.172935888336882, "grad_norm": 2.3052778244018555, "learning_rate": 8.689569049715954e-08, "loss": 0.0694, "step": 18370 }, { "epoch": 2.17411876035013, "grad_norm": 2.8331186771392822, "learning_rate": 8.677142953548764e-08, "loss": 0.0664, "step": 18380 }, { "epoch": 2.1753016323633783, "grad_norm": 2.784214973449707, "learning_rate": 8.664716857381576e-08, "loss": 0.0716, "step": 18390 }, { "epoch": 2.1764845043766265, "grad_norm": 3.5692355632781982, "learning_rate": 8.652290761214385e-08, "loss": 0.0683, "step": 18400 }, { "epoch": 2.1776673763898744, "grad_norm": 3.1544697284698486, "learning_rate": 8.639864665047194e-08, "loss": 0.062, "step": 18410 }, { "epoch": 2.1788502484031227, "grad_norm": 2.3756279945373535, "learning_rate": 8.627438568880004e-08, "loss": 0.0673, "step": 18420 }, { "epoch": 2.180033120416371, "grad_norm": 2.266930341720581, "learning_rate": 8.615012472712815e-08, "loss": 0.0653, "step": 18430 }, { "epoch": 2.1812159924296193, "grad_norm": 3.1127898693084717, "learning_rate": 8.602586376545625e-08, "loss": 0.0643, "step": 18440 }, { "epoch": 2.182398864442867, "grad_norm": 3.303499221801758, "learning_rate": 8.590160280378434e-08, "loss": 0.0783, "step": 18450 }, { "epoch": 2.1835817364561154, "grad_norm": 3.4634835720062256, "learning_rate": 8.577734184211243e-08, "loss": 0.0701, "step": 18460 }, { "epoch": 2.1847646084693637, "grad_norm": 2.3026974201202393, "learning_rate": 8.565308088044055e-08, "loss": 0.0624, "step": 18470 }, { "epoch": 2.1859474804826116, "grad_norm": 2.12914776802063, "learning_rate": 8.552881991876865e-08, "loss": 0.062, "step": 18480 }, { "epoch": 2.18713035249586, "grad_norm": 2.7069342136383057, "learning_rate": 8.540455895709674e-08, "loss": 0.0682, "step": 18490 }, { "epoch": 2.188313224509108, "grad_norm": 3.0162222385406494, "learning_rate": 8.528029799542485e-08, "loss": 0.0647, "step": 18500 }, { "epoch": 2.1894960965223564, "grad_norm": 3.554150342941284, "learning_rate": 8.515603703375295e-08, "loss": 0.0696, "step": 18510 }, { "epoch": 2.1906789685356043, "grad_norm": 3.399521589279175, "learning_rate": 8.503177607208105e-08, "loss": 0.0652, "step": 18520 }, { "epoch": 2.1918618405488526, "grad_norm": 3.1130640506744385, "learning_rate": 8.490751511040915e-08, "loss": 0.0708, "step": 18530 }, { "epoch": 2.193044712562101, "grad_norm": 3.1085402965545654, "learning_rate": 8.478325414873725e-08, "loss": 0.0617, "step": 18540 }, { "epoch": 2.1942275845753487, "grad_norm": 2.9471395015716553, "learning_rate": 8.465899318706535e-08, "loss": 0.0662, "step": 18550 }, { "epoch": 2.195410456588597, "grad_norm": 3.8744702339172363, "learning_rate": 8.453473222539345e-08, "loss": 0.067, "step": 18560 }, { "epoch": 2.1965933286018453, "grad_norm": 2.973573684692383, "learning_rate": 8.441047126372155e-08, "loss": 0.0686, "step": 18570 }, { "epoch": 2.1977762006150936, "grad_norm": 4.832242012023926, "learning_rate": 8.428621030204964e-08, "loss": 0.0801, "step": 18580 }, { "epoch": 2.1989590726283414, "grad_norm": 4.349803447723389, "learning_rate": 8.416194934037775e-08, "loss": 0.0646, "step": 18590 }, { "epoch": 2.2001419446415897, "grad_norm": 3.4971022605895996, "learning_rate": 8.403768837870586e-08, "loss": 0.0646, "step": 18600 }, { "epoch": 2.201324816654838, "grad_norm": 3.3544631004333496, "learning_rate": 8.391342741703395e-08, "loss": 0.067, "step": 18610 }, { "epoch": 2.202507688668086, "grad_norm": 2.560032606124878, "learning_rate": 8.378916645536204e-08, "loss": 0.0635, "step": 18620 }, { "epoch": 2.203690560681334, "grad_norm": 3.170720338821411, "learning_rate": 8.366490549369016e-08, "loss": 0.0709, "step": 18630 }, { "epoch": 2.2048734326945825, "grad_norm": 3.473205089569092, "learning_rate": 8.354064453201826e-08, "loss": 0.071, "step": 18640 }, { "epoch": 2.2060563047078308, "grad_norm": 3.3660693168640137, "learning_rate": 8.341638357034635e-08, "loss": 0.0648, "step": 18650 }, { "epoch": 2.2072391767210786, "grad_norm": 4.10633659362793, "learning_rate": 8.329212260867444e-08, "loss": 0.0686, "step": 18660 }, { "epoch": 2.208422048734327, "grad_norm": 2.6736738681793213, "learning_rate": 8.316786164700256e-08, "loss": 0.0657, "step": 18670 }, { "epoch": 2.209604920747575, "grad_norm": 3.4040610790252686, "learning_rate": 8.304360068533066e-08, "loss": 0.0656, "step": 18680 }, { "epoch": 2.210787792760823, "grad_norm": 4.619311809539795, "learning_rate": 8.291933972365875e-08, "loss": 0.0652, "step": 18690 }, { "epoch": 2.2119706647740713, "grad_norm": 3.1844847202301025, "learning_rate": 8.279507876198684e-08, "loss": 0.0719, "step": 18700 }, { "epoch": 2.2131535367873196, "grad_norm": 3.6539852619171143, "learning_rate": 8.267081780031496e-08, "loss": 0.075, "step": 18710 }, { "epoch": 2.214336408800568, "grad_norm": 2.796314239501953, "learning_rate": 8.254655683864305e-08, "loss": 0.0657, "step": 18720 }, { "epoch": 2.2155192808138158, "grad_norm": 2.885733127593994, "learning_rate": 8.242229587697115e-08, "loss": 0.0747, "step": 18730 }, { "epoch": 2.216702152827064, "grad_norm": 3.258451223373413, "learning_rate": 8.229803491529925e-08, "loss": 0.073, "step": 18740 }, { "epoch": 2.2178850248403124, "grad_norm": 2.8602120876312256, "learning_rate": 8.217377395362736e-08, "loss": 0.0654, "step": 18750 }, { "epoch": 2.2190678968535607, "grad_norm": 3.282493829727173, "learning_rate": 8.204951299195545e-08, "loss": 0.0652, "step": 18760 }, { "epoch": 2.2202507688668085, "grad_norm": 2.9386353492736816, "learning_rate": 8.192525203028355e-08, "loss": 0.068, "step": 18770 }, { "epoch": 2.221433640880057, "grad_norm": 3.175847053527832, "learning_rate": 8.180099106861165e-08, "loss": 0.07, "step": 18780 }, { "epoch": 2.222616512893305, "grad_norm": 2.453228235244751, "learning_rate": 8.167673010693975e-08, "loss": 0.073, "step": 18790 }, { "epoch": 2.223799384906553, "grad_norm": 3.1168158054351807, "learning_rate": 8.155246914526785e-08, "loss": 0.062, "step": 18800 }, { "epoch": 2.2249822569198012, "grad_norm": 3.603816032409668, "learning_rate": 8.142820818359596e-08, "loss": 0.0697, "step": 18810 }, { "epoch": 2.2261651289330495, "grad_norm": 3.2606444358825684, "learning_rate": 8.130394722192405e-08, "loss": 0.0672, "step": 18820 }, { "epoch": 2.227348000946298, "grad_norm": 3.285932779312134, "learning_rate": 8.117968626025215e-08, "loss": 0.0725, "step": 18830 }, { "epoch": 2.2285308729595457, "grad_norm": 3.1257176399230957, "learning_rate": 8.105542529858027e-08, "loss": 0.0737, "step": 18840 }, { "epoch": 2.229713744972794, "grad_norm": 3.5277602672576904, "learning_rate": 8.093116433690836e-08, "loss": 0.0697, "step": 18850 }, { "epoch": 2.2308966169860422, "grad_norm": 3.222766876220703, "learning_rate": 8.080690337523645e-08, "loss": 0.0675, "step": 18860 }, { "epoch": 2.23207948899929, "grad_norm": 2.7133383750915527, "learning_rate": 8.068264241356454e-08, "loss": 0.0688, "step": 18870 }, { "epoch": 2.2332623610125384, "grad_norm": 3.409433126449585, "learning_rate": 8.055838145189266e-08, "loss": 0.0748, "step": 18880 }, { "epoch": 2.2344452330257867, "grad_norm": 3.106909990310669, "learning_rate": 8.043412049022076e-08, "loss": 0.0733, "step": 18890 }, { "epoch": 2.235628105039035, "grad_norm": 2.956697940826416, "learning_rate": 8.030985952854885e-08, "loss": 0.0632, "step": 18900 }, { "epoch": 2.236810977052283, "grad_norm": 4.019081115722656, "learning_rate": 8.018559856687694e-08, "loss": 0.0663, "step": 18910 }, { "epoch": 2.237993849065531, "grad_norm": 3.419271469116211, "learning_rate": 8.006133760520506e-08, "loss": 0.071, "step": 18920 }, { "epoch": 2.2391767210787794, "grad_norm": 3.9245216846466064, "learning_rate": 7.993707664353316e-08, "loss": 0.0715, "step": 18930 }, { "epoch": 2.2403595930920273, "grad_norm": 2.9944989681243896, "learning_rate": 7.981281568186125e-08, "loss": 0.0675, "step": 18940 }, { "epoch": 2.2415424651052756, "grad_norm": 3.624729871749878, "learning_rate": 7.968855472018936e-08, "loss": 0.0604, "step": 18950 }, { "epoch": 2.242725337118524, "grad_norm": 3.574862003326416, "learning_rate": 7.956429375851746e-08, "loss": 0.0666, "step": 18960 }, { "epoch": 2.243908209131772, "grad_norm": 2.3265790939331055, "learning_rate": 7.944003279684556e-08, "loss": 0.0739, "step": 18970 }, { "epoch": 2.24509108114502, "grad_norm": 2.5500175952911377, "learning_rate": 7.931577183517365e-08, "loss": 0.0674, "step": 18980 }, { "epoch": 2.2462739531582683, "grad_norm": 3.230897903442383, "learning_rate": 7.919151087350176e-08, "loss": 0.07, "step": 18990 }, { "epoch": 2.2474568251715166, "grad_norm": 3.004723310470581, "learning_rate": 7.906724991182986e-08, "loss": 0.075, "step": 19000 }, { "epoch": 2.2486396971847644, "grad_norm": 3.0550377368927, "learning_rate": 7.894298895015795e-08, "loss": 0.065, "step": 19010 }, { "epoch": 2.2498225691980127, "grad_norm": 2.4709744453430176, "learning_rate": 7.881872798848606e-08, "loss": 0.0717, "step": 19020 }, { "epoch": 2.251005441211261, "grad_norm": 2.3858835697174072, "learning_rate": 7.869446702681415e-08, "loss": 0.0638, "step": 19030 }, { "epoch": 2.2521883132245093, "grad_norm": 2.9313271045684814, "learning_rate": 7.857020606514226e-08, "loss": 0.0697, "step": 19040 }, { "epoch": 2.253371185237757, "grad_norm": 2.627134323120117, "learning_rate": 7.844594510347037e-08, "loss": 0.0655, "step": 19050 }, { "epoch": 2.2545540572510054, "grad_norm": 4.033355236053467, "learning_rate": 7.832168414179846e-08, "loss": 0.0686, "step": 19060 }, { "epoch": 2.2557369292642537, "grad_norm": 2.5268309116363525, "learning_rate": 7.819742318012655e-08, "loss": 0.0658, "step": 19070 }, { "epoch": 2.2569198012775016, "grad_norm": 1.9077225923538208, "learning_rate": 7.807316221845466e-08, "loss": 0.0708, "step": 19080 }, { "epoch": 2.25810267329075, "grad_norm": 3.635075807571411, "learning_rate": 7.794890125678277e-08, "loss": 0.0712, "step": 19090 }, { "epoch": 2.259285545303998, "grad_norm": 3.852567434310913, "learning_rate": 7.782464029511086e-08, "loss": 0.0705, "step": 19100 }, { "epoch": 2.2604684173172465, "grad_norm": 3.443063497543335, "learning_rate": 7.770037933343895e-08, "loss": 0.0734, "step": 19110 }, { "epoch": 2.2616512893304943, "grad_norm": 2.8455803394317627, "learning_rate": 7.757611837176707e-08, "loss": 0.0613, "step": 19120 }, { "epoch": 2.2628341613437426, "grad_norm": 2.6028025150299072, "learning_rate": 7.745185741009517e-08, "loss": 0.0639, "step": 19130 }, { "epoch": 2.264017033356991, "grad_norm": 2.860522508621216, "learning_rate": 7.732759644842326e-08, "loss": 0.0698, "step": 19140 }, { "epoch": 2.265199905370239, "grad_norm": 3.776965856552124, "learning_rate": 7.720333548675135e-08, "loss": 0.0737, "step": 19150 }, { "epoch": 2.266382777383487, "grad_norm": 2.9697165489196777, "learning_rate": 7.707907452507946e-08, "loss": 0.0681, "step": 19160 }, { "epoch": 2.2675656493967353, "grad_norm": 2.9209208488464355, "learning_rate": 7.695481356340756e-08, "loss": 0.0754, "step": 19170 }, { "epoch": 2.2687485214099836, "grad_norm": 3.1651949882507324, "learning_rate": 7.683055260173566e-08, "loss": 0.0747, "step": 19180 }, { "epoch": 2.2699313934232315, "grad_norm": 2.5308327674865723, "learning_rate": 7.670629164006375e-08, "loss": 0.0702, "step": 19190 }, { "epoch": 2.2711142654364798, "grad_norm": 3.2780795097351074, "learning_rate": 7.658203067839186e-08, "loss": 0.0684, "step": 19200 }, { "epoch": 2.272297137449728, "grad_norm": 4.134498119354248, "learning_rate": 7.645776971671996e-08, "loss": 0.0675, "step": 19210 }, { "epoch": 2.2734800094629763, "grad_norm": 3.3921051025390625, "learning_rate": 7.633350875504806e-08, "loss": 0.0684, "step": 19220 }, { "epoch": 2.274662881476224, "grad_norm": 3.6295573711395264, "learning_rate": 7.620924779337616e-08, "loss": 0.076, "step": 19230 }, { "epoch": 2.2758457534894725, "grad_norm": 2.6500871181488037, "learning_rate": 7.608498683170426e-08, "loss": 0.0722, "step": 19240 }, { "epoch": 2.277028625502721, "grad_norm": 3.189622402191162, "learning_rate": 7.596072587003236e-08, "loss": 0.07, "step": 19250 }, { "epoch": 2.2782114975159686, "grad_norm": 2.8819291591644287, "learning_rate": 7.583646490836047e-08, "loss": 0.0682, "step": 19260 }, { "epoch": 2.279394369529217, "grad_norm": 3.149909257888794, "learning_rate": 7.571220394668856e-08, "loss": 0.0717, "step": 19270 }, { "epoch": 2.280577241542465, "grad_norm": 2.870091199874878, "learning_rate": 7.558794298501666e-08, "loss": 0.0784, "step": 19280 }, { "epoch": 2.2817601135557135, "grad_norm": 2.7320971488952637, "learning_rate": 7.546368202334476e-08, "loss": 0.0719, "step": 19290 }, { "epoch": 2.2829429855689614, "grad_norm": 3.091562032699585, "learning_rate": 7.533942106167287e-08, "loss": 0.0733, "step": 19300 }, { "epoch": 2.2841258575822097, "grad_norm": 4.2651567459106445, "learning_rate": 7.521516010000096e-08, "loss": 0.0687, "step": 19310 }, { "epoch": 2.285308729595458, "grad_norm": 3.206876277923584, "learning_rate": 7.509089913832905e-08, "loss": 0.0676, "step": 19320 }, { "epoch": 2.286491601608706, "grad_norm": 2.990051507949829, "learning_rate": 7.496663817665717e-08, "loss": 0.0767, "step": 19330 }, { "epoch": 2.287674473621954, "grad_norm": 3.273642063140869, "learning_rate": 7.484237721498527e-08, "loss": 0.0716, "step": 19340 }, { "epoch": 2.2888573456352024, "grad_norm": 3.3968088626861572, "learning_rate": 7.471811625331336e-08, "loss": 0.0608, "step": 19350 }, { "epoch": 2.2900402176484507, "grad_norm": 3.8100767135620117, "learning_rate": 7.459385529164145e-08, "loss": 0.074, "step": 19360 }, { "epoch": 2.2912230896616985, "grad_norm": 2.632089376449585, "learning_rate": 7.446959432996957e-08, "loss": 0.0642, "step": 19370 }, { "epoch": 2.292405961674947, "grad_norm": 2.77629017829895, "learning_rate": 7.434533336829767e-08, "loss": 0.0658, "step": 19380 }, { "epoch": 2.293588833688195, "grad_norm": 2.9146740436553955, "learning_rate": 7.422107240662576e-08, "loss": 0.0698, "step": 19390 }, { "epoch": 2.294771705701443, "grad_norm": 2.467982053756714, "learning_rate": 7.409681144495385e-08, "loss": 0.0699, "step": 19400 }, { "epoch": 2.2959545777146912, "grad_norm": 5.146457672119141, "learning_rate": 7.397255048328197e-08, "loss": 0.0691, "step": 19410 }, { "epoch": 2.2971374497279395, "grad_norm": 3.2811567783355713, "learning_rate": 7.384828952161007e-08, "loss": 0.0655, "step": 19420 }, { "epoch": 2.298320321741188, "grad_norm": 2.885089159011841, "learning_rate": 7.372402855993816e-08, "loss": 0.0731, "step": 19430 }, { "epoch": 2.2995031937544357, "grad_norm": 3.216449737548828, "learning_rate": 7.359976759826627e-08, "loss": 0.0704, "step": 19440 }, { "epoch": 2.300686065767684, "grad_norm": 2.8578031063079834, "learning_rate": 7.347550663659437e-08, "loss": 0.0698, "step": 19450 }, { "epoch": 2.3018689377809323, "grad_norm": 3.651703357696533, "learning_rate": 7.335124567492246e-08, "loss": 0.0681, "step": 19460 }, { "epoch": 2.30305180979418, "grad_norm": 3.350461006164551, "learning_rate": 7.322698471325057e-08, "loss": 0.074, "step": 19470 }, { "epoch": 2.3042346818074284, "grad_norm": 2.9606430530548096, "learning_rate": 7.310272375157866e-08, "loss": 0.0753, "step": 19480 }, { "epoch": 2.3054175538206767, "grad_norm": 2.6892459392547607, "learning_rate": 7.297846278990677e-08, "loss": 0.0708, "step": 19490 }, { "epoch": 2.306600425833925, "grad_norm": 2.8564746379852295, "learning_rate": 7.285420182823486e-08, "loss": 0.0729, "step": 19500 }, { "epoch": 2.307783297847173, "grad_norm": 3.3690414428710938, "learning_rate": 7.272994086656297e-08, "loss": 0.073, "step": 19510 }, { "epoch": 2.308966169860421, "grad_norm": 2.3167338371276855, "learning_rate": 7.260567990489106e-08, "loss": 0.0665, "step": 19520 }, { "epoch": 2.3101490418736694, "grad_norm": 3.1788861751556396, "learning_rate": 7.248141894321917e-08, "loss": 0.0692, "step": 19530 }, { "epoch": 2.3113319138869173, "grad_norm": 3.375000238418579, "learning_rate": 7.235715798154728e-08, "loss": 0.069, "step": 19540 }, { "epoch": 2.3125147859001656, "grad_norm": 4.530477046966553, "learning_rate": 7.223289701987537e-08, "loss": 0.0664, "step": 19550 }, { "epoch": 2.313697657913414, "grad_norm": 3.7293052673339844, "learning_rate": 7.210863605820346e-08, "loss": 0.0727, "step": 19560 }, { "epoch": 2.314880529926662, "grad_norm": 3.1132874488830566, "learning_rate": 7.198437509653156e-08, "loss": 0.0718, "step": 19570 }, { "epoch": 2.31606340193991, "grad_norm": 3.279447078704834, "learning_rate": 7.186011413485968e-08, "loss": 0.0699, "step": 19580 }, { "epoch": 2.3172462739531583, "grad_norm": 2.3804566860198975, "learning_rate": 7.173585317318777e-08, "loss": 0.0709, "step": 19590 }, { "epoch": 2.3184291459664066, "grad_norm": 3.6008141040802, "learning_rate": 7.161159221151586e-08, "loss": 0.0658, "step": 19600 }, { "epoch": 2.3196120179796544, "grad_norm": 4.050093650817871, "learning_rate": 7.148733124984395e-08, "loss": 0.0628, "step": 19610 }, { "epoch": 2.3207948899929027, "grad_norm": 3.2886440753936768, "learning_rate": 7.136307028817207e-08, "loss": 0.0644, "step": 19620 }, { "epoch": 2.321977762006151, "grad_norm": 3.266967535018921, "learning_rate": 7.123880932650017e-08, "loss": 0.0706, "step": 19630 }, { "epoch": 2.3231606340193993, "grad_norm": 3.406386613845825, "learning_rate": 7.111454836482826e-08, "loss": 0.0681, "step": 19640 }, { "epoch": 2.324343506032647, "grad_norm": 2.9610936641693115, "learning_rate": 7.099028740315637e-08, "loss": 0.063, "step": 19650 }, { "epoch": 2.3255263780458955, "grad_norm": 3.8344335556030273, "learning_rate": 7.086602644148446e-08, "loss": 0.0706, "step": 19660 }, { "epoch": 2.3267092500591438, "grad_norm": 3.305325984954834, "learning_rate": 7.074176547981257e-08, "loss": 0.0642, "step": 19670 }, { "epoch": 2.3278921220723916, "grad_norm": 4.520902156829834, "learning_rate": 7.061750451814067e-08, "loss": 0.0674, "step": 19680 }, { "epoch": 2.32907499408564, "grad_norm": 3.4153285026550293, "learning_rate": 7.049324355646877e-08, "loss": 0.0682, "step": 19690 }, { "epoch": 2.330257866098888, "grad_norm": 2.704911947250366, "learning_rate": 7.036898259479687e-08, "loss": 0.0621, "step": 19700 }, { "epoch": 2.3314407381121365, "grad_norm": 3.5278639793395996, "learning_rate": 7.024472163312497e-08, "loss": 0.0676, "step": 19710 }, { "epoch": 2.3326236101253843, "grad_norm": 3.620110273361206, "learning_rate": 7.012046067145307e-08, "loss": 0.0695, "step": 19720 }, { "epoch": 2.3338064821386326, "grad_norm": 2.489788055419922, "learning_rate": 6.999619970978117e-08, "loss": 0.068, "step": 19730 }, { "epoch": 2.334989354151881, "grad_norm": 3.4601380825042725, "learning_rate": 6.987193874810927e-08, "loss": 0.0765, "step": 19740 }, { "epoch": 2.3361722261651288, "grad_norm": 4.999173641204834, "learning_rate": 6.974767778643738e-08, "loss": 0.0745, "step": 19750 }, { "epoch": 2.337355098178377, "grad_norm": 2.7591781616210938, "learning_rate": 6.962341682476547e-08, "loss": 0.0651, "step": 19760 }, { "epoch": 2.3385379701916253, "grad_norm": 2.990304470062256, "learning_rate": 6.949915586309358e-08, "loss": 0.0585, "step": 19770 }, { "epoch": 2.3397208422048736, "grad_norm": 3.0400054454803467, "learning_rate": 6.937489490142167e-08, "loss": 0.0738, "step": 19780 }, { "epoch": 2.3409037142181215, "grad_norm": 3.1441431045532227, "learning_rate": 6.925063393974978e-08, "loss": 0.0726, "step": 19790 }, { "epoch": 2.34208658623137, "grad_norm": 3.5926241874694824, "learning_rate": 6.912637297807787e-08, "loss": 0.0692, "step": 19800 }, { "epoch": 2.343269458244618, "grad_norm": 3.3305296897888184, "learning_rate": 6.900211201640598e-08, "loss": 0.0651, "step": 19810 }, { "epoch": 2.344452330257866, "grad_norm": 3.576585531234741, "learning_rate": 6.887785105473407e-08, "loss": 0.0651, "step": 19820 }, { "epoch": 2.345635202271114, "grad_norm": 2.8127026557922363, "learning_rate": 6.875359009306218e-08, "loss": 0.0727, "step": 19830 }, { "epoch": 2.3468180742843625, "grad_norm": 2.9238433837890625, "learning_rate": 6.862932913139027e-08, "loss": 0.0676, "step": 19840 }, { "epoch": 2.348000946297611, "grad_norm": 3.1682302951812744, "learning_rate": 6.850506816971838e-08, "loss": 0.0659, "step": 19850 }, { "epoch": 2.3491838183108587, "grad_norm": 3.837569236755371, "learning_rate": 6.838080720804647e-08, "loss": 0.072, "step": 19860 }, { "epoch": 2.350366690324107, "grad_norm": 2.371891498565674, "learning_rate": 6.825654624637458e-08, "loss": 0.0668, "step": 19870 }, { "epoch": 2.3515495623373552, "grad_norm": 3.0820627212524414, "learning_rate": 6.813228528470267e-08, "loss": 0.0726, "step": 19880 }, { "epoch": 2.352732434350603, "grad_norm": 4.318417072296143, "learning_rate": 6.800802432303077e-08, "loss": 0.0728, "step": 19890 }, { "epoch": 2.3539153063638514, "grad_norm": 3.5189402103424072, "learning_rate": 6.788376336135887e-08, "loss": 0.0742, "step": 19900 }, { "epoch": 2.3550981783770997, "grad_norm": 3.030785083770752, "learning_rate": 6.775950239968697e-08, "loss": 0.0652, "step": 19910 }, { "epoch": 2.356281050390348, "grad_norm": 3.048933506011963, "learning_rate": 6.763524143801507e-08, "loss": 0.0718, "step": 19920 }, { "epoch": 2.357463922403596, "grad_norm": 3.982017755508423, "learning_rate": 6.751098047634317e-08, "loss": 0.0701, "step": 19930 }, { "epoch": 2.358646794416844, "grad_norm": 3.071497917175293, "learning_rate": 6.738671951467127e-08, "loss": 0.0638, "step": 19940 }, { "epoch": 2.3598296664300924, "grad_norm": 3.911365032196045, "learning_rate": 6.726245855299937e-08, "loss": 0.0701, "step": 19950 }, { "epoch": 2.3610125384433402, "grad_norm": 2.8751795291900635, "learning_rate": 6.713819759132748e-08, "loss": 0.0703, "step": 19960 }, { "epoch": 2.3621954104565885, "grad_norm": 3.1359074115753174, "learning_rate": 6.701393662965557e-08, "loss": 0.0674, "step": 19970 }, { "epoch": 2.363378282469837, "grad_norm": 3.202326774597168, "learning_rate": 6.688967566798368e-08, "loss": 0.0716, "step": 19980 }, { "epoch": 2.364561154483085, "grad_norm": 3.496650457382202, "learning_rate": 6.676541470631177e-08, "loss": 0.0745, "step": 19990 }, { "epoch": 2.365744026496333, "grad_norm": 3.4988040924072266, "learning_rate": 6.664115374463988e-08, "loss": 0.0719, "step": 20000 }, { "epoch": 2.3669268985095813, "grad_norm": 3.153658628463745, "learning_rate": 6.651689278296797e-08, "loss": 0.0685, "step": 20010 }, { "epoch": 2.3681097705228296, "grad_norm": 3.0486085414886475, "learning_rate": 6.639263182129608e-08, "loss": 0.0707, "step": 20020 }, { "epoch": 2.3692926425360774, "grad_norm": 2.878833055496216, "learning_rate": 6.626837085962417e-08, "loss": 0.0662, "step": 20030 }, { "epoch": 2.3704755145493257, "grad_norm": 2.9199070930480957, "learning_rate": 6.614410989795228e-08, "loss": 0.0624, "step": 20040 }, { "epoch": 2.371658386562574, "grad_norm": 3.1896402835845947, "learning_rate": 6.601984893628037e-08, "loss": 0.0644, "step": 20050 }, { "epoch": 2.3728412585758223, "grad_norm": 2.818009614944458, "learning_rate": 6.589558797460848e-08, "loss": 0.0706, "step": 20060 }, { "epoch": 2.37402413058907, "grad_norm": 2.2367234230041504, "learning_rate": 6.577132701293657e-08, "loss": 0.0718, "step": 20070 }, { "epoch": 2.3752070026023184, "grad_norm": 2.315342664718628, "learning_rate": 6.564706605126468e-08, "loss": 0.0698, "step": 20080 }, { "epoch": 2.3763898746155667, "grad_norm": 3.0221710205078125, "learning_rate": 6.552280508959277e-08, "loss": 0.072, "step": 20090 }, { "epoch": 2.3775727466288146, "grad_norm": 3.3487725257873535, "learning_rate": 6.539854412792088e-08, "loss": 0.068, "step": 20100 }, { "epoch": 2.378755618642063, "grad_norm": 3.1544973850250244, "learning_rate": 6.527428316624897e-08, "loss": 0.0643, "step": 20110 }, { "epoch": 2.379938490655311, "grad_norm": 2.4769160747528076, "learning_rate": 6.515002220457708e-08, "loss": 0.0664, "step": 20120 }, { "epoch": 2.3811213626685594, "grad_norm": 2.3887939453125, "learning_rate": 6.502576124290517e-08, "loss": 0.0704, "step": 20130 }, { "epoch": 2.3823042346818073, "grad_norm": 3.7411394119262695, "learning_rate": 6.490150028123328e-08, "loss": 0.0728, "step": 20140 }, { "epoch": 2.3834871066950556, "grad_norm": 3.974454641342163, "learning_rate": 6.477723931956137e-08, "loss": 0.0609, "step": 20150 }, { "epoch": 2.384669978708304, "grad_norm": 3.158710479736328, "learning_rate": 6.465297835788948e-08, "loss": 0.0662, "step": 20160 }, { "epoch": 2.3858528507215517, "grad_norm": 2.7081620693206787, "learning_rate": 6.452871739621758e-08, "loss": 0.0655, "step": 20170 }, { "epoch": 2.3870357227348, "grad_norm": 3.5389180183410645, "learning_rate": 6.440445643454568e-08, "loss": 0.0747, "step": 20180 }, { "epoch": 2.3882185947480483, "grad_norm": 3.0176093578338623, "learning_rate": 6.428019547287378e-08, "loss": 0.0737, "step": 20190 }, { "epoch": 2.3894014667612966, "grad_norm": 2.9717726707458496, "learning_rate": 6.415593451120187e-08, "loss": 0.0693, "step": 20200 }, { "epoch": 2.3905843387745445, "grad_norm": 2.814851999282837, "learning_rate": 6.403167354952998e-08, "loss": 0.0663, "step": 20210 }, { "epoch": 2.3917672107877928, "grad_norm": 3.6892342567443848, "learning_rate": 6.390741258785809e-08, "loss": 0.0708, "step": 20220 }, { "epoch": 2.392950082801041, "grad_norm": 2.8098697662353516, "learning_rate": 6.378315162618618e-08, "loss": 0.0666, "step": 20230 }, { "epoch": 2.394132954814289, "grad_norm": 3.3528389930725098, "learning_rate": 6.365889066451429e-08, "loss": 0.0747, "step": 20240 }, { "epoch": 2.395315826827537, "grad_norm": 2.990567207336426, "learning_rate": 6.353462970284238e-08, "loss": 0.0726, "step": 20250 }, { "epoch": 2.3964986988407855, "grad_norm": 3.041092872619629, "learning_rate": 6.341036874117049e-08, "loss": 0.0684, "step": 20260 }, { "epoch": 2.3976815708540338, "grad_norm": 2.68809175491333, "learning_rate": 6.328610777949858e-08, "loss": 0.0664, "step": 20270 }, { "epoch": 2.3988644428672816, "grad_norm": 3.2429609298706055, "learning_rate": 6.316184681782669e-08, "loss": 0.0643, "step": 20280 }, { "epoch": 2.40004731488053, "grad_norm": 2.806102752685547, "learning_rate": 6.303758585615478e-08, "loss": 0.0664, "step": 20290 }, { "epoch": 2.401230186893778, "grad_norm": 3.4858832359313965, "learning_rate": 6.291332489448289e-08, "loss": 0.0694, "step": 20300 }, { "epoch": 2.402413058907026, "grad_norm": 4.34556245803833, "learning_rate": 6.278906393281098e-08, "loss": 0.0759, "step": 20310 }, { "epoch": 2.4035959309202743, "grad_norm": 2.6342098712921143, "learning_rate": 6.266480297113909e-08, "loss": 0.0658, "step": 20320 }, { "epoch": 2.4047788029335226, "grad_norm": 3.5682311058044434, "learning_rate": 6.254054200946718e-08, "loss": 0.0725, "step": 20330 }, { "epoch": 2.405961674946771, "grad_norm": 2.9417006969451904, "learning_rate": 6.241628104779527e-08, "loss": 0.0702, "step": 20340 }, { "epoch": 2.407144546960019, "grad_norm": 3.2417373657226562, "learning_rate": 6.229202008612338e-08, "loss": 0.0739, "step": 20350 }, { "epoch": 2.408327418973267, "grad_norm": 3.5985968112945557, "learning_rate": 6.216775912445147e-08, "loss": 0.0713, "step": 20360 }, { "epoch": 2.4095102909865154, "grad_norm": 4.010875225067139, "learning_rate": 6.204349816277958e-08, "loss": 0.0665, "step": 20370 }, { "epoch": 2.410693162999763, "grad_norm": 2.5911874771118164, "learning_rate": 6.191923720110768e-08, "loss": 0.0724, "step": 20380 }, { "epoch": 2.4118760350130115, "grad_norm": 3.5756425857543945, "learning_rate": 6.179497623943578e-08, "loss": 0.0619, "step": 20390 }, { "epoch": 2.41305890702626, "grad_norm": 2.9737308025360107, "learning_rate": 6.167071527776388e-08, "loss": 0.0674, "step": 20400 }, { "epoch": 2.414241779039508, "grad_norm": 3.136240243911743, "learning_rate": 6.154645431609198e-08, "loss": 0.0707, "step": 20410 }, { "epoch": 2.415424651052756, "grad_norm": 3.1338512897491455, "learning_rate": 6.142219335442008e-08, "loss": 0.076, "step": 20420 }, { "epoch": 2.4166075230660042, "grad_norm": 2.917064666748047, "learning_rate": 6.129793239274819e-08, "loss": 0.0686, "step": 20430 }, { "epoch": 2.4177903950792525, "grad_norm": 3.445251703262329, "learning_rate": 6.117367143107628e-08, "loss": 0.0653, "step": 20440 }, { "epoch": 2.4189732670925004, "grad_norm": 3.967658281326294, "learning_rate": 6.104941046940439e-08, "loss": 0.075, "step": 20450 }, { "epoch": 2.4201561391057487, "grad_norm": 4.382763385772705, "learning_rate": 6.092514950773248e-08, "loss": 0.0669, "step": 20460 }, { "epoch": 2.421339011118997, "grad_norm": 2.978804349899292, "learning_rate": 6.080088854606059e-08, "loss": 0.0689, "step": 20470 }, { "epoch": 2.4225218831322453, "grad_norm": 2.78229022026062, "learning_rate": 6.067662758438868e-08, "loss": 0.067, "step": 20480 }, { "epoch": 2.423704755145493, "grad_norm": 3.6299872398376465, "learning_rate": 6.055236662271679e-08, "loss": 0.0672, "step": 20490 }, { "epoch": 2.4248876271587414, "grad_norm": 3.432361364364624, "learning_rate": 6.042810566104488e-08, "loss": 0.0676, "step": 20500 }, { "epoch": 2.4260704991719897, "grad_norm": 3.24328351020813, "learning_rate": 6.030384469937299e-08, "loss": 0.0687, "step": 20510 }, { "epoch": 2.4272533711852375, "grad_norm": 3.053133487701416, "learning_rate": 6.017958373770108e-08, "loss": 0.0721, "step": 20520 }, { "epoch": 2.428436243198486, "grad_norm": 2.8688578605651855, "learning_rate": 6.005532277602919e-08, "loss": 0.0693, "step": 20530 }, { "epoch": 2.429619115211734, "grad_norm": 3.0003042221069336, "learning_rate": 5.993106181435728e-08, "loss": 0.0662, "step": 20540 }, { "epoch": 2.4308019872249824, "grad_norm": 2.603759527206421, "learning_rate": 5.980680085268539e-08, "loss": 0.0671, "step": 20550 }, { "epoch": 2.4319848592382303, "grad_norm": 3.256493091583252, "learning_rate": 5.968253989101348e-08, "loss": 0.0652, "step": 20560 }, { "epoch": 2.4331677312514786, "grad_norm": 2.4740326404571533, "learning_rate": 5.9558278929341586e-08, "loss": 0.0673, "step": 20570 }, { "epoch": 2.434350603264727, "grad_norm": 3.8654115200042725, "learning_rate": 5.9434017967669686e-08, "loss": 0.074, "step": 20580 }, { "epoch": 2.4355334752779747, "grad_norm": 3.378047466278076, "learning_rate": 5.9309757005997786e-08, "loss": 0.0658, "step": 20590 }, { "epoch": 2.436716347291223, "grad_norm": 3.0305588245391846, "learning_rate": 5.9185496044325885e-08, "loss": 0.0669, "step": 20600 }, { "epoch": 2.4378992193044713, "grad_norm": 3.0686771869659424, "learning_rate": 5.9061235082653985e-08, "loss": 0.0685, "step": 20610 }, { "epoch": 2.4390820913177196, "grad_norm": 2.918227195739746, "learning_rate": 5.8936974120982085e-08, "loss": 0.0645, "step": 20620 }, { "epoch": 2.4402649633309674, "grad_norm": 2.451552629470825, "learning_rate": 5.881271315931019e-08, "loss": 0.0731, "step": 20630 }, { "epoch": 2.4414478353442157, "grad_norm": 3.2491912841796875, "learning_rate": 5.8688452197638285e-08, "loss": 0.0657, "step": 20640 }, { "epoch": 2.442630707357464, "grad_norm": 3.8466570377349854, "learning_rate": 5.856419123596639e-08, "loss": 0.0694, "step": 20650 }, { "epoch": 2.443813579370712, "grad_norm": 3.965696096420288, "learning_rate": 5.8439930274294484e-08, "loss": 0.0666, "step": 20660 }, { "epoch": 2.44499645138396, "grad_norm": 3.1525020599365234, "learning_rate": 5.831566931262259e-08, "loss": 0.0651, "step": 20670 }, { "epoch": 2.4461793233972084, "grad_norm": 3.241399049758911, "learning_rate": 5.8191408350950684e-08, "loss": 0.0748, "step": 20680 }, { "epoch": 2.4473621954104567, "grad_norm": 3.820563554763794, "learning_rate": 5.806714738927879e-08, "loss": 0.0702, "step": 20690 }, { "epoch": 2.4485450674237046, "grad_norm": 2.667703151702881, "learning_rate": 5.794288642760689e-08, "loss": 0.0694, "step": 20700 }, { "epoch": 2.449727939436953, "grad_norm": 3.7656984329223633, "learning_rate": 5.781862546593499e-08, "loss": 0.067, "step": 20710 }, { "epoch": 2.450910811450201, "grad_norm": 2.811617612838745, "learning_rate": 5.769436450426309e-08, "loss": 0.0683, "step": 20720 }, { "epoch": 2.452093683463449, "grad_norm": 2.6551849842071533, "learning_rate": 5.757010354259118e-08, "loss": 0.0687, "step": 20730 }, { "epoch": 2.4532765554766973, "grad_norm": 3.8386309146881104, "learning_rate": 5.744584258091929e-08, "loss": 0.0714, "step": 20740 }, { "epoch": 2.4544594274899456, "grad_norm": 3.8310112953186035, "learning_rate": 5.732158161924738e-08, "loss": 0.0699, "step": 20750 }, { "epoch": 2.455642299503194, "grad_norm": 3.6500697135925293, "learning_rate": 5.719732065757549e-08, "loss": 0.0682, "step": 20760 }, { "epoch": 2.4568251715164418, "grad_norm": 2.423082113265991, "learning_rate": 5.707305969590359e-08, "loss": 0.0749, "step": 20770 }, { "epoch": 2.45800804352969, "grad_norm": 3.2503881454467773, "learning_rate": 5.694879873423169e-08, "loss": 0.0778, "step": 20780 }, { "epoch": 2.4591909155429383, "grad_norm": 4.175827980041504, "learning_rate": 5.682453777255979e-08, "loss": 0.0723, "step": 20790 }, { "epoch": 2.460373787556186, "grad_norm": 2.586982488632202, "learning_rate": 5.670027681088789e-08, "loss": 0.0665, "step": 20800 }, { "epoch": 2.4615566595694345, "grad_norm": 4.415169715881348, "learning_rate": 5.657601584921599e-08, "loss": 0.0691, "step": 20810 }, { "epoch": 2.4627395315826828, "grad_norm": 3.2696831226348877, "learning_rate": 5.645175488754409e-08, "loss": 0.0729, "step": 20820 }, { "epoch": 2.463922403595931, "grad_norm": 2.888906717300415, "learning_rate": 5.632749392587219e-08, "loss": 0.0719, "step": 20830 }, { "epoch": 2.465105275609179, "grad_norm": 2.999946117401123, "learning_rate": 5.6203232964200293e-08, "loss": 0.0696, "step": 20840 }, { "epoch": 2.466288147622427, "grad_norm": 3.8324460983276367, "learning_rate": 5.6078972002528386e-08, "loss": 0.0627, "step": 20850 }, { "epoch": 2.4674710196356755, "grad_norm": 3.1873254776000977, "learning_rate": 5.595471104085649e-08, "loss": 0.0626, "step": 20860 }, { "epoch": 2.4686538916489233, "grad_norm": 2.8496763706207275, "learning_rate": 5.5830450079184586e-08, "loss": 0.066, "step": 20870 }, { "epoch": 2.4698367636621716, "grad_norm": 4.160651683807373, "learning_rate": 5.570618911751269e-08, "loss": 0.0701, "step": 20880 }, { "epoch": 2.47101963567542, "grad_norm": 3.575068235397339, "learning_rate": 5.5581928155840786e-08, "loss": 0.0665, "step": 20890 }, { "epoch": 2.4722025076886682, "grad_norm": 3.710167169570923, "learning_rate": 5.545766719416889e-08, "loss": 0.072, "step": 20900 }, { "epoch": 2.473385379701916, "grad_norm": 2.958299398422241, "learning_rate": 5.533340623249699e-08, "loss": 0.0702, "step": 20910 }, { "epoch": 2.4745682517151644, "grad_norm": 3.0757930278778076, "learning_rate": 5.520914527082509e-08, "loss": 0.0671, "step": 20920 }, { "epoch": 2.4757511237284127, "grad_norm": 2.9666850566864014, "learning_rate": 5.508488430915319e-08, "loss": 0.0726, "step": 20930 }, { "epoch": 2.4769339957416605, "grad_norm": 3.446643114089966, "learning_rate": 5.496062334748129e-08, "loss": 0.0786, "step": 20940 }, { "epoch": 2.478116867754909, "grad_norm": 3.5303544998168945, "learning_rate": 5.483636238580939e-08, "loss": 0.0622, "step": 20950 }, { "epoch": 2.479299739768157, "grad_norm": 3.7855000495910645, "learning_rate": 5.47121014241375e-08, "loss": 0.0634, "step": 20960 }, { "epoch": 2.4804826117814054, "grad_norm": 3.407914400100708, "learning_rate": 5.458784046246559e-08, "loss": 0.0669, "step": 20970 }, { "epoch": 2.4816654837946532, "grad_norm": 3.120037317276001, "learning_rate": 5.44635795007937e-08, "loss": 0.0693, "step": 20980 }, { "epoch": 2.4828483558079015, "grad_norm": 3.043419361114502, "learning_rate": 5.433931853912179e-08, "loss": 0.0743, "step": 20990 }, { "epoch": 2.48403122782115, "grad_norm": 3.0903618335723877, "learning_rate": 5.4215057577449896e-08, "loss": 0.0657, "step": 21000 }, { "epoch": 2.4852140998343977, "grad_norm": 3.5894105434417725, "learning_rate": 5.409079661577799e-08, "loss": 0.0744, "step": 21010 }, { "epoch": 2.486396971847646, "grad_norm": 2.3074684143066406, "learning_rate": 5.3966535654106096e-08, "loss": 0.0619, "step": 21020 }, { "epoch": 2.4875798438608943, "grad_norm": 3.2901809215545654, "learning_rate": 5.384227469243419e-08, "loss": 0.0729, "step": 21030 }, { "epoch": 2.4887627158741426, "grad_norm": 2.6073129177093506, "learning_rate": 5.3718013730762295e-08, "loss": 0.0655, "step": 21040 }, { "epoch": 2.4899455878873904, "grad_norm": 4.002037525177002, "learning_rate": 5.3593752769090395e-08, "loss": 0.0678, "step": 21050 }, { "epoch": 2.4911284599006387, "grad_norm": 2.4820361137390137, "learning_rate": 5.3469491807418495e-08, "loss": 0.0629, "step": 21060 }, { "epoch": 2.492311331913887, "grad_norm": 3.4876818656921387, "learning_rate": 5.3345230845746595e-08, "loss": 0.0649, "step": 21070 }, { "epoch": 2.4934942039271353, "grad_norm": 2.3747706413269043, "learning_rate": 5.3220969884074695e-08, "loss": 0.069, "step": 21080 }, { "epoch": 2.494677075940383, "grad_norm": 3.019644260406494, "learning_rate": 5.3096708922402794e-08, "loss": 0.0746, "step": 21090 }, { "epoch": 2.4958599479536314, "grad_norm": 3.5671465396881104, "learning_rate": 5.29724479607309e-08, "loss": 0.0723, "step": 21100 }, { "epoch": 2.4970428199668797, "grad_norm": 3.3151583671569824, "learning_rate": 5.2848186999058994e-08, "loss": 0.0698, "step": 21110 }, { "epoch": 2.4982256919801276, "grad_norm": 3.0636043548583984, "learning_rate": 5.2723926037387094e-08, "loss": 0.0683, "step": 21120 }, { "epoch": 2.499408563993376, "grad_norm": 2.583174467086792, "learning_rate": 5.2599665075715193e-08, "loss": 0.0777, "step": 21130 }, { "epoch": 2.500591436006624, "grad_norm": 4.3315534591674805, "learning_rate": 5.247540411404329e-08, "loss": 0.0684, "step": 21140 }, { "epoch": 2.501774308019872, "grad_norm": 3.169900894165039, "learning_rate": 5.235114315237139e-08, "loss": 0.0775, "step": 21150 }, { "epoch": 2.5029571800331203, "grad_norm": 3.308530330657959, "learning_rate": 5.222688219069949e-08, "loss": 0.0675, "step": 21160 }, { "epoch": 2.5041400520463686, "grad_norm": 2.6778247356414795, "learning_rate": 5.21026212290276e-08, "loss": 0.0718, "step": 21170 }, { "epoch": 2.505322924059617, "grad_norm": 2.3848886489868164, "learning_rate": 5.197836026735569e-08, "loss": 0.0683, "step": 21180 }, { "epoch": 2.506505796072865, "grad_norm": 3.2060649394989014, "learning_rate": 5.18540993056838e-08, "loss": 0.0718, "step": 21190 }, { "epoch": 2.507688668086113, "grad_norm": 3.052159309387207, "learning_rate": 5.172983834401189e-08, "loss": 0.0719, "step": 21200 }, { "epoch": 2.5088715400993613, "grad_norm": 3.2646844387054443, "learning_rate": 5.160557738234e-08, "loss": 0.0577, "step": 21210 }, { "epoch": 2.510054412112609, "grad_norm": 3.175251007080078, "learning_rate": 5.148131642066809e-08, "loss": 0.0655, "step": 21220 }, { "epoch": 2.5112372841258574, "grad_norm": 3.3089234828948975, "learning_rate": 5.13570554589962e-08, "loss": 0.08, "step": 21230 }, { "epoch": 2.5124201561391057, "grad_norm": 3.0600051879882812, "learning_rate": 5.123279449732429e-08, "loss": 0.0724, "step": 21240 }, { "epoch": 2.513603028152354, "grad_norm": 3.680511474609375, "learning_rate": 5.11085335356524e-08, "loss": 0.0731, "step": 21250 }, { "epoch": 2.5147859001656023, "grad_norm": 3.4339873790740967, "learning_rate": 5.09842725739805e-08, "loss": 0.065, "step": 21260 }, { "epoch": 2.51596877217885, "grad_norm": 2.5370731353759766, "learning_rate": 5.08600116123086e-08, "loss": 0.0633, "step": 21270 }, { "epoch": 2.5171516441920985, "grad_norm": 2.546860694885254, "learning_rate": 5.07357506506367e-08, "loss": 0.0708, "step": 21280 }, { "epoch": 2.5183345162053463, "grad_norm": 2.777308464050293, "learning_rate": 5.0611489688964796e-08, "loss": 0.0657, "step": 21290 }, { "epoch": 2.5195173882185946, "grad_norm": 3.3679428100585938, "learning_rate": 5.0487228727292896e-08, "loss": 0.0691, "step": 21300 }, { "epoch": 2.520700260231843, "grad_norm": 4.507331371307373, "learning_rate": 5.0362967765621e-08, "loss": 0.0703, "step": 21310 }, { "epoch": 2.521883132245091, "grad_norm": 3.1866302490234375, "learning_rate": 5.0238706803949096e-08, "loss": 0.0658, "step": 21320 }, { "epoch": 2.5230660042583395, "grad_norm": 2.9365956783294678, "learning_rate": 5.01144458422772e-08, "loss": 0.068, "step": 21330 }, { "epoch": 2.5242488762715873, "grad_norm": 3.036160707473755, "learning_rate": 4.9990184880605295e-08, "loss": 0.0688, "step": 21340 }, { "epoch": 2.5254317482848356, "grad_norm": 3.0367228984832764, "learning_rate": 4.98659239189334e-08, "loss": 0.0696, "step": 21350 }, { "epoch": 2.5266146202980835, "grad_norm": 3.2429752349853516, "learning_rate": 4.9741662957261495e-08, "loss": 0.0618, "step": 21360 }, { "epoch": 2.5277974923113318, "grad_norm": 3.9371144771575928, "learning_rate": 4.96174019955896e-08, "loss": 0.064, "step": 21370 }, { "epoch": 2.52898036432458, "grad_norm": 2.527611255645752, "learning_rate": 4.94931410339177e-08, "loss": 0.066, "step": 21380 }, { "epoch": 2.5301632363378284, "grad_norm": 2.7044613361358643, "learning_rate": 4.93688800722458e-08, "loss": 0.0713, "step": 21390 }, { "epoch": 2.5313461083510767, "grad_norm": 3.3098816871643066, "learning_rate": 4.92446191105739e-08, "loss": 0.0616, "step": 21400 }, { "epoch": 2.5325289803643245, "grad_norm": 3.2602717876434326, "learning_rate": 4.9120358148902e-08, "loss": 0.0734, "step": 21410 }, { "epoch": 2.533711852377573, "grad_norm": 3.4712891578674316, "learning_rate": 4.89960971872301e-08, "loss": 0.0718, "step": 21420 }, { "epoch": 2.5348947243908206, "grad_norm": 2.452327013015747, "learning_rate": 4.8871836225558207e-08, "loss": 0.0697, "step": 21430 }, { "epoch": 2.536077596404069, "grad_norm": 3.637732982635498, "learning_rate": 4.87475752638863e-08, "loss": 0.0659, "step": 21440 }, { "epoch": 2.5372604684173172, "grad_norm": 2.8679518699645996, "learning_rate": 4.8623314302214406e-08, "loss": 0.0677, "step": 21450 }, { "epoch": 2.5384433404305655, "grad_norm": 3.387627601623535, "learning_rate": 4.84990533405425e-08, "loss": 0.0704, "step": 21460 }, { "epoch": 2.539626212443814, "grad_norm": 2.856396198272705, "learning_rate": 4.8374792378870606e-08, "loss": 0.0682, "step": 21470 }, { "epoch": 2.5408090844570617, "grad_norm": 3.6260440349578857, "learning_rate": 4.82505314171987e-08, "loss": 0.0712, "step": 21480 }, { "epoch": 2.54199195647031, "grad_norm": 3.367079257965088, "learning_rate": 4.8126270455526805e-08, "loss": 0.069, "step": 21490 }, { "epoch": 2.543174828483558, "grad_norm": 3.1531054973602295, "learning_rate": 4.80020094938549e-08, "loss": 0.0697, "step": 21500 }, { "epoch": 2.544357700496806, "grad_norm": 3.1578261852264404, "learning_rate": 4.7877748532183e-08, "loss": 0.0643, "step": 21510 }, { "epoch": 2.5455405725100544, "grad_norm": 3.352064371109009, "learning_rate": 4.7753487570511105e-08, "loss": 0.0685, "step": 21520 }, { "epoch": 2.5467234445233027, "grad_norm": 3.39077091217041, "learning_rate": 4.76292266088392e-08, "loss": 0.0752, "step": 21530 }, { "epoch": 2.547906316536551, "grad_norm": 2.9291720390319824, "learning_rate": 4.7504965647167304e-08, "loss": 0.0657, "step": 21540 }, { "epoch": 2.549089188549799, "grad_norm": 3.387965440750122, "learning_rate": 4.73807046854954e-08, "loss": 0.0674, "step": 21550 }, { "epoch": 2.550272060563047, "grad_norm": 4.256340980529785, "learning_rate": 4.7256443723823504e-08, "loss": 0.0623, "step": 21560 }, { "epoch": 2.5514549325762954, "grad_norm": 3.1629345417022705, "learning_rate": 4.71321827621516e-08, "loss": 0.069, "step": 21570 }, { "epoch": 2.5526378045895433, "grad_norm": 3.0780043601989746, "learning_rate": 4.70079218004797e-08, "loss": 0.0681, "step": 21580 }, { "epoch": 2.5538206766027916, "grad_norm": 4.033207893371582, "learning_rate": 4.68836608388078e-08, "loss": 0.0774, "step": 21590 }, { "epoch": 2.55500354861604, "grad_norm": 3.3976829051971436, "learning_rate": 4.67593998771359e-08, "loss": 0.0632, "step": 21600 }, { "epoch": 2.556186420629288, "grad_norm": 3.091001510620117, "learning_rate": 4.6635138915464e-08, "loss": 0.0692, "step": 21610 }, { "epoch": 2.557369292642536, "grad_norm": 3.6429781913757324, "learning_rate": 4.65108779537921e-08, "loss": 0.0667, "step": 21620 }, { "epoch": 2.5585521646557843, "grad_norm": 3.1973443031311035, "learning_rate": 4.63866169921202e-08, "loss": 0.0728, "step": 21630 }, { "epoch": 2.5597350366690326, "grad_norm": 3.286808967590332, "learning_rate": 4.626235603044831e-08, "loss": 0.0721, "step": 21640 }, { "epoch": 2.5609179086822804, "grad_norm": 3.3763904571533203, "learning_rate": 4.61380950687764e-08, "loss": 0.0702, "step": 21650 }, { "epoch": 2.5621007806955287, "grad_norm": 3.045109748840332, "learning_rate": 4.601383410710451e-08, "loss": 0.0638, "step": 21660 }, { "epoch": 2.563283652708777, "grad_norm": 2.5209641456604004, "learning_rate": 4.58895731454326e-08, "loss": 0.0637, "step": 21670 }, { "epoch": 2.5644665247220253, "grad_norm": 3.280564069747925, "learning_rate": 4.576531218376071e-08, "loss": 0.0751, "step": 21680 }, { "epoch": 2.565649396735273, "grad_norm": 3.2775492668151855, "learning_rate": 4.56410512220888e-08, "loss": 0.0738, "step": 21690 }, { "epoch": 2.5668322687485214, "grad_norm": 2.8992691040039062, "learning_rate": 4.551679026041691e-08, "loss": 0.0676, "step": 21700 }, { "epoch": 2.5680151407617697, "grad_norm": 2.876856565475464, "learning_rate": 4.5392529298745e-08, "loss": 0.0737, "step": 21710 }, { "epoch": 2.5691980127750176, "grad_norm": 3.402176856994629, "learning_rate": 4.526826833707311e-08, "loss": 0.0684, "step": 21720 }, { "epoch": 2.570380884788266, "grad_norm": 4.214558124542236, "learning_rate": 4.5144007375401206e-08, "loss": 0.0702, "step": 21730 }, { "epoch": 2.571563756801514, "grad_norm": 3.231106758117676, "learning_rate": 4.5019746413729306e-08, "loss": 0.0672, "step": 21740 }, { "epoch": 2.5727466288147625, "grad_norm": 3.048285484313965, "learning_rate": 4.4895485452057406e-08, "loss": 0.0681, "step": 21750 }, { "epoch": 2.5739295008280103, "grad_norm": 4.027153491973877, "learning_rate": 4.4771224490385506e-08, "loss": 0.0752, "step": 21760 }, { "epoch": 2.5751123728412586, "grad_norm": 2.9008939266204834, "learning_rate": 4.4646963528713606e-08, "loss": 0.0629, "step": 21770 }, { "epoch": 2.576295244854507, "grad_norm": 3.074000597000122, "learning_rate": 4.452270256704171e-08, "loss": 0.0688, "step": 21780 }, { "epoch": 2.5774781168677547, "grad_norm": 3.2186930179595947, "learning_rate": 4.4398441605369805e-08, "loss": 0.0708, "step": 21790 }, { "epoch": 2.578660988881003, "grad_norm": 2.7646713256835938, "learning_rate": 4.427418064369791e-08, "loss": 0.0721, "step": 21800 }, { "epoch": 2.5798438608942513, "grad_norm": 3.2666561603546143, "learning_rate": 4.4149919682026005e-08, "loss": 0.0708, "step": 21810 }, { "epoch": 2.5810267329074996, "grad_norm": 2.394519805908203, "learning_rate": 4.402565872035411e-08, "loss": 0.0663, "step": 21820 }, { "epoch": 2.5822096049207475, "grad_norm": 3.2482638359069824, "learning_rate": 4.3901397758682204e-08, "loss": 0.0675, "step": 21830 }, { "epoch": 2.5833924769339958, "grad_norm": 2.993562698364258, "learning_rate": 4.377713679701031e-08, "loss": 0.0673, "step": 21840 }, { "epoch": 2.584575348947244, "grad_norm": 4.0283355712890625, "learning_rate": 4.365287583533841e-08, "loss": 0.0728, "step": 21850 }, { "epoch": 2.585758220960492, "grad_norm": 2.251343011856079, "learning_rate": 4.352861487366651e-08, "loss": 0.0634, "step": 21860 }, { "epoch": 2.58694109297374, "grad_norm": 2.9414656162261963, "learning_rate": 4.340435391199461e-08, "loss": 0.0661, "step": 21870 }, { "epoch": 2.5881239649869885, "grad_norm": 2.415847063064575, "learning_rate": 4.328009295032271e-08, "loss": 0.0765, "step": 21880 }, { "epoch": 2.589306837000237, "grad_norm": 2.581651449203491, "learning_rate": 4.315583198865081e-08, "loss": 0.0673, "step": 21890 }, { "epoch": 2.5904897090134846, "grad_norm": 4.425063133239746, "learning_rate": 4.30315710269789e-08, "loss": 0.0679, "step": 21900 }, { "epoch": 2.591672581026733, "grad_norm": 3.7032532691955566, "learning_rate": 4.290731006530701e-08, "loss": 0.0701, "step": 21910 }, { "epoch": 2.592855453039981, "grad_norm": 3.931595802307129, "learning_rate": 4.27830491036351e-08, "loss": 0.0671, "step": 21920 }, { "epoch": 2.594038325053229, "grad_norm": 2.838977813720703, "learning_rate": 4.265878814196321e-08, "loss": 0.0748, "step": 21930 }, { "epoch": 2.5952211970664774, "grad_norm": 3.7482340335845947, "learning_rate": 4.253452718029131e-08, "loss": 0.0709, "step": 21940 }, { "epoch": 2.5964040690797257, "grad_norm": 3.7211782932281494, "learning_rate": 4.241026621861941e-08, "loss": 0.0606, "step": 21950 }, { "epoch": 2.597586941092974, "grad_norm": 3.2082037925720215, "learning_rate": 4.228600525694751e-08, "loss": 0.069, "step": 21960 }, { "epoch": 2.598769813106222, "grad_norm": 2.602865695953369, "learning_rate": 4.216174429527561e-08, "loss": 0.076, "step": 21970 }, { "epoch": 2.59995268511947, "grad_norm": 3.313108444213867, "learning_rate": 4.203748333360371e-08, "loss": 0.0678, "step": 21980 }, { "epoch": 2.6011355571327184, "grad_norm": 4.319264888763428, "learning_rate": 4.1913222371931814e-08, "loss": 0.0714, "step": 21990 }, { "epoch": 2.6023184291459662, "grad_norm": 2.712340831756592, "learning_rate": 4.178896141025991e-08, "loss": 0.0703, "step": 22000 }, { "epoch": 2.6035013011592145, "grad_norm": 3.3775196075439453, "learning_rate": 4.1664700448588013e-08, "loss": 0.0673, "step": 22010 }, { "epoch": 2.604684173172463, "grad_norm": 2.960801362991333, "learning_rate": 4.1540439486916107e-08, "loss": 0.0622, "step": 22020 }, { "epoch": 2.605867045185711, "grad_norm": 2.3393702507019043, "learning_rate": 4.141617852524421e-08, "loss": 0.0721, "step": 22030 }, { "epoch": 2.607049917198959, "grad_norm": 3.1801366806030273, "learning_rate": 4.1291917563572306e-08, "loss": 0.0721, "step": 22040 }, { "epoch": 2.6082327892122072, "grad_norm": 2.98709774017334, "learning_rate": 4.116765660190041e-08, "loss": 0.0646, "step": 22050 }, { "epoch": 2.6094156612254555, "grad_norm": 2.707758665084839, "learning_rate": 4.104339564022851e-08, "loss": 0.0685, "step": 22060 }, { "epoch": 2.6105985332387034, "grad_norm": 3.1815690994262695, "learning_rate": 4.091913467855661e-08, "loss": 0.0708, "step": 22070 }, { "epoch": 2.6117814052519517, "grad_norm": 3.331700563430786, "learning_rate": 4.079487371688471e-08, "loss": 0.0697, "step": 22080 }, { "epoch": 2.6129642772652, "grad_norm": 3.6442410945892334, "learning_rate": 4.067061275521281e-08, "loss": 0.0747, "step": 22090 }, { "epoch": 2.6141471492784483, "grad_norm": 2.8058922290802, "learning_rate": 4.054635179354091e-08, "loss": 0.0668, "step": 22100 }, { "epoch": 2.615330021291696, "grad_norm": 3.696707248687744, "learning_rate": 4.042209083186901e-08, "loss": 0.0673, "step": 22110 }, { "epoch": 2.6165128933049444, "grad_norm": 3.3231287002563477, "learning_rate": 4.029782987019711e-08, "loss": 0.0742, "step": 22120 }, { "epoch": 2.6176957653181927, "grad_norm": 3.163526773452759, "learning_rate": 4.017356890852522e-08, "loss": 0.0668, "step": 22130 }, { "epoch": 2.6188786373314406, "grad_norm": 3.257746696472168, "learning_rate": 4.004930794685331e-08, "loss": 0.0783, "step": 22140 }, { "epoch": 2.620061509344689, "grad_norm": 2.7941107749938965, "learning_rate": 3.992504698518142e-08, "loss": 0.0655, "step": 22150 }, { "epoch": 2.621244381357937, "grad_norm": 2.977311611175537, "learning_rate": 3.980078602350951e-08, "loss": 0.0655, "step": 22160 }, { "epoch": 2.6224272533711854, "grad_norm": 2.806717872619629, "learning_rate": 3.9676525061837616e-08, "loss": 0.0732, "step": 22170 }, { "epoch": 2.6236101253844333, "grad_norm": 3.268996000289917, "learning_rate": 3.955226410016571e-08, "loss": 0.071, "step": 22180 }, { "epoch": 2.6247929973976816, "grad_norm": 3.0925230979919434, "learning_rate": 3.9428003138493816e-08, "loss": 0.0693, "step": 22190 }, { "epoch": 2.62597586941093, "grad_norm": 4.48992395401001, "learning_rate": 3.9303742176821916e-08, "loss": 0.0701, "step": 22200 }, { "epoch": 2.6271587414241777, "grad_norm": 3.8978800773620605, "learning_rate": 3.9179481215150016e-08, "loss": 0.061, "step": 22210 }, { "epoch": 2.628341613437426, "grad_norm": 2.463874101638794, "learning_rate": 3.9055220253478115e-08, "loss": 0.0641, "step": 22220 }, { "epoch": 2.6295244854506743, "grad_norm": 3.396801710128784, "learning_rate": 3.8930959291806215e-08, "loss": 0.0725, "step": 22230 }, { "epoch": 2.6307073574639226, "grad_norm": 4.054720401763916, "learning_rate": 3.8806698330134315e-08, "loss": 0.0684, "step": 22240 }, { "epoch": 2.6318902294771704, "grad_norm": 3.775111198425293, "learning_rate": 3.868243736846242e-08, "loss": 0.0644, "step": 22250 }, { "epoch": 2.6330731014904187, "grad_norm": 3.0586581230163574, "learning_rate": 3.8558176406790514e-08, "loss": 0.0664, "step": 22260 }, { "epoch": 2.634255973503667, "grad_norm": 2.9350037574768066, "learning_rate": 3.843391544511862e-08, "loss": 0.0678, "step": 22270 }, { "epoch": 2.635438845516915, "grad_norm": 3.167886257171631, "learning_rate": 3.8309654483446714e-08, "loss": 0.0766, "step": 22280 }, { "epoch": 2.636621717530163, "grad_norm": 3.007800579071045, "learning_rate": 3.8185393521774814e-08, "loss": 0.0752, "step": 22290 }, { "epoch": 2.6378045895434115, "grad_norm": 3.2512412071228027, "learning_rate": 3.8061132560102914e-08, "loss": 0.0654, "step": 22300 }, { "epoch": 2.6389874615566598, "grad_norm": 3.2873430252075195, "learning_rate": 3.7936871598431013e-08, "loss": 0.0723, "step": 22310 }, { "epoch": 2.6401703335699076, "grad_norm": 3.4470338821411133, "learning_rate": 3.781261063675911e-08, "loss": 0.069, "step": 22320 }, { "epoch": 2.641353205583156, "grad_norm": 3.1342926025390625, "learning_rate": 3.768834967508721e-08, "loss": 0.0677, "step": 22330 }, { "epoch": 2.642536077596404, "grad_norm": 2.9987034797668457, "learning_rate": 3.756408871341532e-08, "loss": 0.0685, "step": 22340 }, { "epoch": 2.643718949609652, "grad_norm": 3.4063258171081543, "learning_rate": 3.743982775174341e-08, "loss": 0.0628, "step": 22350 }, { "epoch": 2.6449018216229003, "grad_norm": 3.303565502166748, "learning_rate": 3.731556679007152e-08, "loss": 0.06, "step": 22360 }, { "epoch": 2.6460846936361486, "grad_norm": 3.332014560699463, "learning_rate": 3.719130582839961e-08, "loss": 0.0664, "step": 22370 }, { "epoch": 2.647267565649397, "grad_norm": 3.3142497539520264, "learning_rate": 3.706704486672772e-08, "loss": 0.0744, "step": 22380 }, { "epoch": 2.6484504376626448, "grad_norm": 4.033775806427002, "learning_rate": 3.694278390505581e-08, "loss": 0.0729, "step": 22390 }, { "epoch": 2.649633309675893, "grad_norm": 3.7637715339660645, "learning_rate": 3.681852294338392e-08, "loss": 0.0708, "step": 22400 }, { "epoch": 2.6508161816891413, "grad_norm": 3.3785977363586426, "learning_rate": 3.669426198171202e-08, "loss": 0.0638, "step": 22410 }, { "epoch": 2.651999053702389, "grad_norm": 3.2724790573120117, "learning_rate": 3.657000102004012e-08, "loss": 0.0696, "step": 22420 }, { "epoch": 2.6531819257156375, "grad_norm": 3.6320011615753174, "learning_rate": 3.644574005836822e-08, "loss": 0.0714, "step": 22430 }, { "epoch": 2.654364797728886, "grad_norm": 3.5249099731445312, "learning_rate": 3.632147909669632e-08, "loss": 0.0738, "step": 22440 }, { "epoch": 2.655547669742134, "grad_norm": 3.825455904006958, "learning_rate": 3.619721813502442e-08, "loss": 0.0698, "step": 22450 }, { "epoch": 2.656730541755382, "grad_norm": 3.597285747528076, "learning_rate": 3.607295717335252e-08, "loss": 0.0673, "step": 22460 }, { "epoch": 2.65791341376863, "grad_norm": 4.432921886444092, "learning_rate": 3.5948696211680616e-08, "loss": 0.0718, "step": 22470 }, { "epoch": 2.6590962857818785, "grad_norm": 3.2154297828674316, "learning_rate": 3.582443525000872e-08, "loss": 0.0766, "step": 22480 }, { "epoch": 2.6602791577951264, "grad_norm": 3.0304551124572754, "learning_rate": 3.5700174288336816e-08, "loss": 0.0684, "step": 22490 }, { "epoch": 2.6614620298083747, "grad_norm": 3.509697198867798, "learning_rate": 3.557591332666492e-08, "loss": 0.0685, "step": 22500 }, { "epoch": 2.662644901821623, "grad_norm": 3.3298797607421875, "learning_rate": 3.5451652364993015e-08, "loss": 0.0694, "step": 22510 }, { "epoch": 2.6638277738348712, "grad_norm": 2.8463172912597656, "learning_rate": 3.5327391403321115e-08, "loss": 0.0675, "step": 22520 }, { "epoch": 2.665010645848119, "grad_norm": 3.5765838623046875, "learning_rate": 3.5203130441649215e-08, "loss": 0.074, "step": 22530 }, { "epoch": 2.6661935178613674, "grad_norm": 3.419750452041626, "learning_rate": 3.507886947997732e-08, "loss": 0.068, "step": 22540 }, { "epoch": 2.6673763898746157, "grad_norm": 2.8827641010284424, "learning_rate": 3.495460851830542e-08, "loss": 0.0701, "step": 22550 }, { "epoch": 2.6685592618878635, "grad_norm": 3.3743560314178467, "learning_rate": 3.483034755663352e-08, "loss": 0.0667, "step": 22560 }, { "epoch": 2.669742133901112, "grad_norm": 3.232088804244995, "learning_rate": 3.470608659496162e-08, "loss": 0.0628, "step": 22570 }, { "epoch": 2.67092500591436, "grad_norm": 2.6892333030700684, "learning_rate": 3.458182563328972e-08, "loss": 0.0721, "step": 22580 }, { "epoch": 2.6721078779276084, "grad_norm": 2.789792060852051, "learning_rate": 3.445756467161782e-08, "loss": 0.0725, "step": 22590 }, { "epoch": 2.6732907499408562, "grad_norm": 2.8546249866485596, "learning_rate": 3.433330370994592e-08, "loss": 0.0766, "step": 22600 }, { "epoch": 2.6744736219541045, "grad_norm": 3.307654857635498, "learning_rate": 3.420904274827402e-08, "loss": 0.0639, "step": 22610 }, { "epoch": 2.675656493967353, "grad_norm": 3.8087542057037354, "learning_rate": 3.408478178660212e-08, "loss": 0.0698, "step": 22620 }, { "epoch": 2.6768393659806007, "grad_norm": 3.6131536960601807, "learning_rate": 3.396052082493022e-08, "loss": 0.0673, "step": 22630 }, { "epoch": 2.678022237993849, "grad_norm": 3.471791982650757, "learning_rate": 3.383625986325832e-08, "loss": 0.0698, "step": 22640 }, { "epoch": 2.6792051100070973, "grad_norm": 3.727810859680176, "learning_rate": 3.371199890158642e-08, "loss": 0.0766, "step": 22650 }, { "epoch": 2.6803879820203456, "grad_norm": 3.693774700164795, "learning_rate": 3.358773793991452e-08, "loss": 0.0665, "step": 22660 }, { "epoch": 2.6815708540335934, "grad_norm": 2.5812134742736816, "learning_rate": 3.3463476978242625e-08, "loss": 0.0694, "step": 22670 }, { "epoch": 2.6827537260468417, "grad_norm": 3.0700087547302246, "learning_rate": 3.3339216016570725e-08, "loss": 0.067, "step": 22680 }, { "epoch": 2.68393659806009, "grad_norm": 3.778233528137207, "learning_rate": 3.3214955054898825e-08, "loss": 0.0741, "step": 22690 }, { "epoch": 2.685119470073338, "grad_norm": 3.913827657699585, "learning_rate": 3.3090694093226924e-08, "loss": 0.0695, "step": 22700 }, { "epoch": 2.686302342086586, "grad_norm": 3.754943609237671, "learning_rate": 3.2966433131555024e-08, "loss": 0.0684, "step": 22710 }, { "epoch": 2.6874852140998344, "grad_norm": 2.7651286125183105, "learning_rate": 3.2842172169883124e-08, "loss": 0.0675, "step": 22720 }, { "epoch": 2.6886680861130827, "grad_norm": 3.3525748252868652, "learning_rate": 3.2717911208211224e-08, "loss": 0.0743, "step": 22730 }, { "epoch": 2.6898509581263306, "grad_norm": 3.1835572719573975, "learning_rate": 3.2593650246539324e-08, "loss": 0.067, "step": 22740 }, { "epoch": 2.691033830139579, "grad_norm": 3.9008944034576416, "learning_rate": 3.2469389284867423e-08, "loss": 0.0686, "step": 22750 }, { "epoch": 2.692216702152827, "grad_norm": 3.7975831031799316, "learning_rate": 3.234512832319552e-08, "loss": 0.0668, "step": 22760 }, { "epoch": 2.693399574166075, "grad_norm": 2.604588031768799, "learning_rate": 3.222086736152362e-08, "loss": 0.0608, "step": 22770 }, { "epoch": 2.6945824461793233, "grad_norm": 2.8739511966705322, "learning_rate": 3.209660639985172e-08, "loss": 0.0764, "step": 22780 }, { "epoch": 2.6957653181925716, "grad_norm": 4.588569641113281, "learning_rate": 3.197234543817982e-08, "loss": 0.0699, "step": 22790 }, { "epoch": 2.69694819020582, "grad_norm": 3.087343692779541, "learning_rate": 3.184808447650793e-08, "loss": 0.066, "step": 22800 }, { "epoch": 2.6981310622190677, "grad_norm": 4.53203010559082, "learning_rate": 3.172382351483603e-08, "loss": 0.0698, "step": 22810 }, { "epoch": 2.699313934232316, "grad_norm": 2.917200803756714, "learning_rate": 3.159956255316413e-08, "loss": 0.0672, "step": 22820 }, { "epoch": 2.7004968062455643, "grad_norm": 3.4556140899658203, "learning_rate": 3.147530159149223e-08, "loss": 0.0688, "step": 22830 }, { "epoch": 2.701679678258812, "grad_norm": 4.337028503417969, "learning_rate": 3.135104062982033e-08, "loss": 0.0723, "step": 22840 }, { "epoch": 2.7028625502720605, "grad_norm": 2.9990217685699463, "learning_rate": 3.122677966814843e-08, "loss": 0.0656, "step": 22850 }, { "epoch": 2.7040454222853088, "grad_norm": 4.099867820739746, "learning_rate": 3.110251870647653e-08, "loss": 0.0728, "step": 22860 }, { "epoch": 2.705228294298557, "grad_norm": 2.9667274951934814, "learning_rate": 3.097825774480462e-08, "loss": 0.0608, "step": 22870 }, { "epoch": 2.7064111663118053, "grad_norm": 3.314774990081787, "learning_rate": 3.085399678313273e-08, "loss": 0.078, "step": 22880 }, { "epoch": 2.707594038325053, "grad_norm": 3.761791229248047, "learning_rate": 3.072973582146083e-08, "loss": 0.0655, "step": 22890 }, { "epoch": 2.7087769103383015, "grad_norm": 2.650496482849121, "learning_rate": 3.0605474859788927e-08, "loss": 0.0662, "step": 22900 }, { "epoch": 2.7099597823515493, "grad_norm": 3.7776522636413574, "learning_rate": 3.0481213898117026e-08, "loss": 0.0732, "step": 22910 }, { "epoch": 2.7111426543647976, "grad_norm": 3.8866970539093018, "learning_rate": 3.0356952936445126e-08, "loss": 0.0657, "step": 22920 }, { "epoch": 2.712325526378046, "grad_norm": 4.012598991394043, "learning_rate": 3.0232691974773226e-08, "loss": 0.0755, "step": 22930 }, { "epoch": 2.713508398391294, "grad_norm": 2.4890356063842773, "learning_rate": 3.0108431013101326e-08, "loss": 0.0777, "step": 22940 }, { "epoch": 2.7146912704045425, "grad_norm": 3.2825400829315186, "learning_rate": 2.9984170051429425e-08, "loss": 0.0676, "step": 22950 }, { "epoch": 2.7158741424177903, "grad_norm": 3.4787325859069824, "learning_rate": 2.9859909089757525e-08, "loss": 0.0673, "step": 22960 }, { "epoch": 2.7170570144310386, "grad_norm": 3.5525760650634766, "learning_rate": 2.9735648128085625e-08, "loss": 0.0689, "step": 22970 }, { "epoch": 2.7182398864442865, "grad_norm": 3.3804848194122314, "learning_rate": 2.9611387166413728e-08, "loss": 0.0644, "step": 22980 }, { "epoch": 2.719422758457535, "grad_norm": 3.3154397010803223, "learning_rate": 2.9487126204741828e-08, "loss": 0.0707, "step": 22990 }, { "epoch": 2.720605630470783, "grad_norm": 2.4002254009246826, "learning_rate": 2.9362865243069928e-08, "loss": 0.07, "step": 23000 }, { "epoch": 2.7217885024840314, "grad_norm": 5.03951358795166, "learning_rate": 2.9238604281398027e-08, "loss": 0.0646, "step": 23010 }, { "epoch": 2.7229713744972797, "grad_norm": 2.763324022293091, "learning_rate": 2.9114343319726127e-08, "loss": 0.0746, "step": 23020 }, { "epoch": 2.7241542465105275, "grad_norm": 3.1723828315734863, "learning_rate": 2.8990082358054227e-08, "loss": 0.0695, "step": 23030 }, { "epoch": 2.725337118523776, "grad_norm": 2.9355878829956055, "learning_rate": 2.886582139638233e-08, "loss": 0.0677, "step": 23040 }, { "epoch": 2.7265199905370237, "grad_norm": 3.972551107406616, "learning_rate": 2.874156043471043e-08, "loss": 0.0689, "step": 23050 }, { "epoch": 2.727702862550272, "grad_norm": 3.0345091819763184, "learning_rate": 2.861729947303853e-08, "loss": 0.0676, "step": 23060 }, { "epoch": 2.7288857345635202, "grad_norm": 3.044019937515259, "learning_rate": 2.849303851136663e-08, "loss": 0.065, "step": 23070 }, { "epoch": 2.7300686065767685, "grad_norm": 3.371647834777832, "learning_rate": 2.836877754969473e-08, "loss": 0.0735, "step": 23080 }, { "epoch": 2.731251478590017, "grad_norm": 3.1692323684692383, "learning_rate": 2.824451658802283e-08, "loss": 0.073, "step": 23090 }, { "epoch": 2.7324343506032647, "grad_norm": 3.1517064571380615, "learning_rate": 2.812025562635093e-08, "loss": 0.0659, "step": 23100 }, { "epoch": 2.733617222616513, "grad_norm": 4.5725812911987305, "learning_rate": 2.7995994664679032e-08, "loss": 0.0721, "step": 23110 }, { "epoch": 2.734800094629761, "grad_norm": 2.914660930633545, "learning_rate": 2.787173370300713e-08, "loss": 0.0663, "step": 23120 }, { "epoch": 2.735982966643009, "grad_norm": 3.196193218231201, "learning_rate": 2.774747274133523e-08, "loss": 0.0742, "step": 23130 }, { "epoch": 2.7371658386562574, "grad_norm": 3.5078625679016113, "learning_rate": 2.762321177966333e-08, "loss": 0.0773, "step": 23140 }, { "epoch": 2.7383487106695057, "grad_norm": 3.981733560562134, "learning_rate": 2.749895081799143e-08, "loss": 0.0757, "step": 23150 }, { "epoch": 2.739531582682754, "grad_norm": 3.2053661346435547, "learning_rate": 2.737468985631953e-08, "loss": 0.0693, "step": 23160 }, { "epoch": 2.740714454696002, "grad_norm": 3.1356325149536133, "learning_rate": 2.7250428894647634e-08, "loss": 0.0738, "step": 23170 }, { "epoch": 2.74189732670925, "grad_norm": 3.1126911640167236, "learning_rate": 2.7126167932975734e-08, "loss": 0.0745, "step": 23180 }, { "epoch": 2.743080198722498, "grad_norm": 3.7980241775512695, "learning_rate": 2.7001906971303833e-08, "loss": 0.0754, "step": 23190 }, { "epoch": 2.7442630707357463, "grad_norm": 4.280790328979492, "learning_rate": 2.6877646009631933e-08, "loss": 0.0778, "step": 23200 }, { "epoch": 2.7454459427489946, "grad_norm": 2.8880527019500732, "learning_rate": 2.6753385047960033e-08, "loss": 0.0698, "step": 23210 }, { "epoch": 2.746628814762243, "grad_norm": 2.9308314323425293, "learning_rate": 2.6629124086288133e-08, "loss": 0.0686, "step": 23220 }, { "epoch": 2.747811686775491, "grad_norm": 3.1716878414154053, "learning_rate": 2.6504863124616232e-08, "loss": 0.0705, "step": 23230 }, { "epoch": 2.748994558788739, "grad_norm": 2.9896187782287598, "learning_rate": 2.6380602162944336e-08, "loss": 0.0721, "step": 23240 }, { "epoch": 2.7501774308019873, "grad_norm": 3.468550443649292, "learning_rate": 2.6256341201272435e-08, "loss": 0.0658, "step": 23250 }, { "epoch": 2.751360302815235, "grad_norm": 3.858621597290039, "learning_rate": 2.6132080239600532e-08, "loss": 0.0696, "step": 23260 }, { "epoch": 2.7525431748284834, "grad_norm": 3.472635269165039, "learning_rate": 2.600781927792863e-08, "loss": 0.0607, "step": 23270 }, { "epoch": 2.7537260468417317, "grad_norm": 2.7996819019317627, "learning_rate": 2.588355831625673e-08, "loss": 0.0612, "step": 23280 }, { "epoch": 2.75490891885498, "grad_norm": 3.07797908782959, "learning_rate": 2.575929735458483e-08, "loss": 0.0671, "step": 23290 }, { "epoch": 2.7560917908682283, "grad_norm": 2.461479663848877, "learning_rate": 2.563503639291293e-08, "loss": 0.0615, "step": 23300 }, { "epoch": 2.757274662881476, "grad_norm": 3.829969882965088, "learning_rate": 2.551077543124103e-08, "loss": 0.0711, "step": 23310 }, { "epoch": 2.7584575348947244, "grad_norm": 4.139762878417969, "learning_rate": 2.5386514469569134e-08, "loss": 0.0719, "step": 23320 }, { "epoch": 2.7596404069079723, "grad_norm": 2.5766336917877197, "learning_rate": 2.5262253507897233e-08, "loss": 0.07, "step": 23330 }, { "epoch": 2.7608232789212206, "grad_norm": 2.9779305458068848, "learning_rate": 2.5137992546225333e-08, "loss": 0.0608, "step": 23340 }, { "epoch": 2.762006150934469, "grad_norm": 3.6536033153533936, "learning_rate": 2.5013731584553433e-08, "loss": 0.0685, "step": 23350 }, { "epoch": 2.763189022947717, "grad_norm": 3.829739570617676, "learning_rate": 2.4889470622881533e-08, "loss": 0.0658, "step": 23360 }, { "epoch": 2.7643718949609655, "grad_norm": 3.4438154697418213, "learning_rate": 2.4765209661209633e-08, "loss": 0.0777, "step": 23370 }, { "epoch": 2.7655547669742133, "grad_norm": 3.5677950382232666, "learning_rate": 2.4640948699537736e-08, "loss": 0.0754, "step": 23380 }, { "epoch": 2.7667376389874616, "grad_norm": 2.8858349323272705, "learning_rate": 2.4516687737865835e-08, "loss": 0.0633, "step": 23390 }, { "epoch": 2.7679205110007095, "grad_norm": 3.943669557571411, "learning_rate": 2.4392426776193935e-08, "loss": 0.0687, "step": 23400 }, { "epoch": 2.7691033830139578, "grad_norm": 2.989211320877075, "learning_rate": 2.4268165814522035e-08, "loss": 0.0644, "step": 23410 }, { "epoch": 2.770286255027206, "grad_norm": 2.899470090866089, "learning_rate": 2.4143904852850135e-08, "loss": 0.0742, "step": 23420 }, { "epoch": 2.7714691270404543, "grad_norm": 2.978602409362793, "learning_rate": 2.4019643891178235e-08, "loss": 0.0781, "step": 23430 }, { "epoch": 2.7726519990537026, "grad_norm": 3.2131423950195312, "learning_rate": 2.3895382929506334e-08, "loss": 0.0765, "step": 23440 }, { "epoch": 2.7738348710669505, "grad_norm": 3.4835116863250732, "learning_rate": 2.3771121967834437e-08, "loss": 0.0699, "step": 23450 }, { "epoch": 2.7750177430801988, "grad_norm": 3.8306548595428467, "learning_rate": 2.3646861006162537e-08, "loss": 0.0642, "step": 23460 }, { "epoch": 2.7762006150934466, "grad_norm": 3.384497880935669, "learning_rate": 2.3522600044490637e-08, "loss": 0.0674, "step": 23470 }, { "epoch": 2.777383487106695, "grad_norm": 2.712010145187378, "learning_rate": 2.3398339082818737e-08, "loss": 0.068, "step": 23480 }, { "epoch": 2.778566359119943, "grad_norm": 2.798992872238159, "learning_rate": 2.3274078121146837e-08, "loss": 0.0711, "step": 23490 }, { "epoch": 2.7797492311331915, "grad_norm": 3.615861177444458, "learning_rate": 2.3149817159474936e-08, "loss": 0.0732, "step": 23500 }, { "epoch": 2.78093210314644, "grad_norm": 2.850001573562622, "learning_rate": 2.3025556197803036e-08, "loss": 0.0588, "step": 23510 }, { "epoch": 2.7821149751596876, "grad_norm": 3.389594078063965, "learning_rate": 2.290129523613114e-08, "loss": 0.0672, "step": 23520 }, { "epoch": 2.783297847172936, "grad_norm": 3.630868911743164, "learning_rate": 2.277703427445924e-08, "loss": 0.064, "step": 23530 }, { "epoch": 2.784480719186184, "grad_norm": 3.5113329887390137, "learning_rate": 2.265277331278734e-08, "loss": 0.0699, "step": 23540 }, { "epoch": 2.785663591199432, "grad_norm": 4.105371952056885, "learning_rate": 2.252851235111544e-08, "loss": 0.0747, "step": 23550 }, { "epoch": 2.7868464632126804, "grad_norm": 4.118639945983887, "learning_rate": 2.2404251389443538e-08, "loss": 0.0712, "step": 23560 }, { "epoch": 2.7880293352259287, "grad_norm": 2.6430187225341797, "learning_rate": 2.2279990427771638e-08, "loss": 0.0669, "step": 23570 }, { "epoch": 2.789212207239177, "grad_norm": 3.698467969894409, "learning_rate": 2.215572946609974e-08, "loss": 0.0753, "step": 23580 }, { "epoch": 2.790395079252425, "grad_norm": 2.763742685317993, "learning_rate": 2.203146850442784e-08, "loss": 0.0681, "step": 23590 }, { "epoch": 2.791577951265673, "grad_norm": 3.5889534950256348, "learning_rate": 2.190720754275594e-08, "loss": 0.0761, "step": 23600 }, { "epoch": 2.792760823278921, "grad_norm": 2.697157144546509, "learning_rate": 2.178294658108404e-08, "loss": 0.0678, "step": 23610 }, { "epoch": 2.7939436952921692, "grad_norm": 3.1441104412078857, "learning_rate": 2.165868561941214e-08, "loss": 0.0791, "step": 23620 }, { "epoch": 2.7951265673054175, "grad_norm": 3.7634973526000977, "learning_rate": 2.153442465774024e-08, "loss": 0.0682, "step": 23630 }, { "epoch": 2.796309439318666, "grad_norm": 3.155895709991455, "learning_rate": 2.141016369606834e-08, "loss": 0.069, "step": 23640 }, { "epoch": 2.797492311331914, "grad_norm": 4.22262716293335, "learning_rate": 2.1285902734396436e-08, "loss": 0.0814, "step": 23650 }, { "epoch": 2.798675183345162, "grad_norm": 4.504295349121094, "learning_rate": 2.116164177272454e-08, "loss": 0.0709, "step": 23660 }, { "epoch": 2.7998580553584103, "grad_norm": 3.4668681621551514, "learning_rate": 2.103738081105264e-08, "loss": 0.0668, "step": 23670 }, { "epoch": 2.8010409273716586, "grad_norm": 2.937405586242676, "learning_rate": 2.091311984938074e-08, "loss": 0.0683, "step": 23680 }, { "epoch": 2.8022237993849064, "grad_norm": 3.4830100536346436, "learning_rate": 2.078885888770884e-08, "loss": 0.0732, "step": 23690 }, { "epoch": 2.8034066713981547, "grad_norm": 3.8185551166534424, "learning_rate": 2.066459792603694e-08, "loss": 0.077, "step": 23700 }, { "epoch": 2.804589543411403, "grad_norm": 4.0837931632995605, "learning_rate": 2.0540336964365038e-08, "loss": 0.0645, "step": 23710 }, { "epoch": 2.8057724154246513, "grad_norm": 12.1671781539917, "learning_rate": 2.0416076002693138e-08, "loss": 0.0613, "step": 23720 }, { "epoch": 2.806955287437899, "grad_norm": 2.7819197177886963, "learning_rate": 2.029181504102124e-08, "loss": 0.068, "step": 23730 }, { "epoch": 2.8081381594511474, "grad_norm": 2.412692070007324, "learning_rate": 2.016755407934934e-08, "loss": 0.0674, "step": 23740 }, { "epoch": 2.8093210314643957, "grad_norm": 3.309048891067505, "learning_rate": 2.004329311767744e-08, "loss": 0.0677, "step": 23750 }, { "epoch": 2.8105039034776436, "grad_norm": 4.172591686248779, "learning_rate": 1.991903215600554e-08, "loss": 0.0676, "step": 23760 }, { "epoch": 2.811686775490892, "grad_norm": 2.379964590072632, "learning_rate": 1.979477119433364e-08, "loss": 0.062, "step": 23770 }, { "epoch": 2.81286964750414, "grad_norm": 2.506260395050049, "learning_rate": 1.967051023266174e-08, "loss": 0.0707, "step": 23780 }, { "epoch": 2.8140525195173884, "grad_norm": 3.6394309997558594, "learning_rate": 1.9546249270989843e-08, "loss": 0.0794, "step": 23790 }, { "epoch": 2.8152353915306363, "grad_norm": 3.781012773513794, "learning_rate": 1.9421988309317943e-08, "loss": 0.0653, "step": 23800 }, { "epoch": 2.8164182635438846, "grad_norm": 2.8679256439208984, "learning_rate": 1.9297727347646043e-08, "loss": 0.0692, "step": 23810 }, { "epoch": 2.817601135557133, "grad_norm": 3.2986834049224854, "learning_rate": 1.9173466385974142e-08, "loss": 0.0714, "step": 23820 }, { "epoch": 2.8187840075703807, "grad_norm": 3.9402968883514404, "learning_rate": 1.9049205424302242e-08, "loss": 0.0846, "step": 23830 }, { "epoch": 2.819966879583629, "grad_norm": 3.0922346115112305, "learning_rate": 1.8924944462630342e-08, "loss": 0.0692, "step": 23840 }, { "epoch": 2.8211497515968773, "grad_norm": 3.4186081886291504, "learning_rate": 1.8800683500958442e-08, "loss": 0.0684, "step": 23850 }, { "epoch": 2.8223326236101256, "grad_norm": 3.5178170204162598, "learning_rate": 1.8676422539286545e-08, "loss": 0.0706, "step": 23860 }, { "epoch": 2.8235154956233735, "grad_norm": 3.22206449508667, "learning_rate": 1.8552161577614645e-08, "loss": 0.0666, "step": 23870 }, { "epoch": 2.8246983676366217, "grad_norm": 2.6437458992004395, "learning_rate": 1.8427900615942744e-08, "loss": 0.077, "step": 23880 }, { "epoch": 2.82588123964987, "grad_norm": 3.145684003829956, "learning_rate": 1.8303639654270844e-08, "loss": 0.0695, "step": 23890 }, { "epoch": 2.827064111663118, "grad_norm": 4.204176425933838, "learning_rate": 1.8179378692598944e-08, "loss": 0.0709, "step": 23900 }, { "epoch": 2.828246983676366, "grad_norm": 3.731091260910034, "learning_rate": 1.8055117730927044e-08, "loss": 0.0679, "step": 23910 }, { "epoch": 2.8294298556896145, "grad_norm": 3.7869977951049805, "learning_rate": 1.7930856769255147e-08, "loss": 0.0767, "step": 23920 }, { "epoch": 2.8306127277028628, "grad_norm": 3.077378749847412, "learning_rate": 1.7806595807583247e-08, "loss": 0.0644, "step": 23930 }, { "epoch": 2.8317955997161106, "grad_norm": 2.8287906646728516, "learning_rate": 1.7682334845911343e-08, "loss": 0.0679, "step": 23940 }, { "epoch": 2.832978471729359, "grad_norm": 3.2165982723236084, "learning_rate": 1.7558073884239443e-08, "loss": 0.07, "step": 23950 }, { "epoch": 2.834161343742607, "grad_norm": 3.6504950523376465, "learning_rate": 1.7433812922567546e-08, "loss": 0.0611, "step": 23960 }, { "epoch": 2.835344215755855, "grad_norm": 3.4370744228363037, "learning_rate": 1.7309551960895646e-08, "loss": 0.0721, "step": 23970 }, { "epoch": 2.8365270877691033, "grad_norm": 3.1095407009124756, "learning_rate": 1.7185290999223745e-08, "loss": 0.0751, "step": 23980 }, { "epoch": 2.8377099597823516, "grad_norm": 3.2289836406707764, "learning_rate": 1.7061030037551845e-08, "loss": 0.0785, "step": 23990 }, { "epoch": 2.8388928317956, "grad_norm": 3.572751522064209, "learning_rate": 1.6936769075879945e-08, "loss": 0.0678, "step": 24000 }, { "epoch": 2.8400757038088478, "grad_norm": 2.947082281112671, "learning_rate": 1.6812508114208045e-08, "loss": 0.0588, "step": 24010 }, { "epoch": 2.841258575822096, "grad_norm": 2.533470630645752, "learning_rate": 1.6688247152536145e-08, "loss": 0.0651, "step": 24020 }, { "epoch": 2.8424414478353444, "grad_norm": 3.0881171226501465, "learning_rate": 1.6563986190864248e-08, "loss": 0.0757, "step": 24030 }, { "epoch": 2.843624319848592, "grad_norm": 3.562513828277588, "learning_rate": 1.6439725229192347e-08, "loss": 0.074, "step": 24040 }, { "epoch": 2.8448071918618405, "grad_norm": 3.2912750244140625, "learning_rate": 1.6315464267520447e-08, "loss": 0.0655, "step": 24050 }, { "epoch": 2.845990063875089, "grad_norm": 4.055639743804932, "learning_rate": 1.6191203305848547e-08, "loss": 0.0635, "step": 24060 }, { "epoch": 2.847172935888337, "grad_norm": 4.253708839416504, "learning_rate": 1.6066942344176647e-08, "loss": 0.0696, "step": 24070 }, { "epoch": 2.848355807901585, "grad_norm": 3.0281968116760254, "learning_rate": 1.5942681382504746e-08, "loss": 0.0735, "step": 24080 }, { "epoch": 2.8495386799148332, "grad_norm": 2.376620292663574, "learning_rate": 1.581842042083285e-08, "loss": 0.0688, "step": 24090 }, { "epoch": 2.8507215519280815, "grad_norm": 3.41910982131958, "learning_rate": 1.569415945916095e-08, "loss": 0.0711, "step": 24100 }, { "epoch": 2.8519044239413294, "grad_norm": 3.1903233528137207, "learning_rate": 1.556989849748905e-08, "loss": 0.0652, "step": 24110 }, { "epoch": 2.8530872959545777, "grad_norm": 2.2977607250213623, "learning_rate": 1.544563753581715e-08, "loss": 0.0672, "step": 24120 }, { "epoch": 2.854270167967826, "grad_norm": 3.158501148223877, "learning_rate": 1.532137657414525e-08, "loss": 0.0812, "step": 24130 }, { "epoch": 2.8554530399810742, "grad_norm": 2.666475772857666, "learning_rate": 1.519711561247335e-08, "loss": 0.0678, "step": 24140 }, { "epoch": 2.856635911994322, "grad_norm": 2.8212578296661377, "learning_rate": 1.5072854650801448e-08, "loss": 0.0692, "step": 24150 }, { "epoch": 2.8578187840075704, "grad_norm": 3.044511079788208, "learning_rate": 1.4948593689129548e-08, "loss": 0.0666, "step": 24160 }, { "epoch": 2.8590016560208187, "grad_norm": 2.721667528152466, "learning_rate": 1.4824332727457648e-08, "loss": 0.0608, "step": 24170 }, { "epoch": 2.8601845280340665, "grad_norm": 4.513845443725586, "learning_rate": 1.470007176578575e-08, "loss": 0.0725, "step": 24180 }, { "epoch": 2.861367400047315, "grad_norm": 2.461515426635742, "learning_rate": 1.4575810804113849e-08, "loss": 0.0677, "step": 24190 }, { "epoch": 2.862550272060563, "grad_norm": 3.4336321353912354, "learning_rate": 1.4451549842441949e-08, "loss": 0.0726, "step": 24200 }, { "epoch": 2.8637331440738114, "grad_norm": 3.385463237762451, "learning_rate": 1.4327288880770049e-08, "loss": 0.0698, "step": 24210 }, { "epoch": 2.8649160160870593, "grad_norm": 2.8916895389556885, "learning_rate": 1.420302791909815e-08, "loss": 0.0627, "step": 24220 }, { "epoch": 2.8660988881003076, "grad_norm": 2.9126689434051514, "learning_rate": 1.407876695742625e-08, "loss": 0.0713, "step": 24230 }, { "epoch": 2.867281760113556, "grad_norm": 3.204958438873291, "learning_rate": 1.395450599575435e-08, "loss": 0.0638, "step": 24240 }, { "epoch": 2.8684646321268037, "grad_norm": 3.1574063301086426, "learning_rate": 1.3830245034082451e-08, "loss": 0.0653, "step": 24250 }, { "epoch": 2.869647504140052, "grad_norm": 3.26656436920166, "learning_rate": 1.370598407241055e-08, "loss": 0.0703, "step": 24260 }, { "epoch": 2.8708303761533003, "grad_norm": 2.7431623935699463, "learning_rate": 1.358172311073865e-08, "loss": 0.065, "step": 24270 }, { "epoch": 2.8720132481665486, "grad_norm": 2.881309747695923, "learning_rate": 1.3457462149066752e-08, "loss": 0.0717, "step": 24280 }, { "epoch": 2.8731961201797964, "grad_norm": 3.3779397010803223, "learning_rate": 1.3333201187394852e-08, "loss": 0.0614, "step": 24290 }, { "epoch": 2.8743789921930447, "grad_norm": 3.177034378051758, "learning_rate": 1.3208940225722951e-08, "loss": 0.0608, "step": 24300 }, { "epoch": 2.875561864206293, "grad_norm": 3.1794652938842773, "learning_rate": 1.3084679264051053e-08, "loss": 0.0654, "step": 24310 }, { "epoch": 2.876744736219541, "grad_norm": 3.59674334526062, "learning_rate": 1.2960418302379153e-08, "loss": 0.0649, "step": 24320 }, { "epoch": 2.877927608232789, "grad_norm": 2.4915788173675537, "learning_rate": 1.283615734070725e-08, "loss": 0.0677, "step": 24330 }, { "epoch": 2.8791104802460374, "grad_norm": 3.743321180343628, "learning_rate": 1.271189637903535e-08, "loss": 0.0637, "step": 24340 }, { "epoch": 2.8802933522592857, "grad_norm": 3.375450372695923, "learning_rate": 1.2587635417363452e-08, "loss": 0.0762, "step": 24350 }, { "epoch": 2.8814762242725336, "grad_norm": 3.3149499893188477, "learning_rate": 1.2463374455691552e-08, "loss": 0.0637, "step": 24360 }, { "epoch": 2.882659096285782, "grad_norm": 2.4577083587646484, "learning_rate": 1.2339113494019652e-08, "loss": 0.0657, "step": 24370 }, { "epoch": 2.88384196829903, "grad_norm": 2.8333680629730225, "learning_rate": 1.2214852532347751e-08, "loss": 0.0661, "step": 24380 }, { "epoch": 2.885024840312278, "grad_norm": 3.592026710510254, "learning_rate": 1.2090591570675853e-08, "loss": 0.0717, "step": 24390 }, { "epoch": 2.8862077123255263, "grad_norm": 3.7010674476623535, "learning_rate": 1.1966330609003953e-08, "loss": 0.0726, "step": 24400 }, { "epoch": 2.8873905843387746, "grad_norm": 3.846701145172119, "learning_rate": 1.1842069647332052e-08, "loss": 0.0719, "step": 24410 }, { "epoch": 2.888573456352023, "grad_norm": 3.1845052242279053, "learning_rate": 1.1717808685660154e-08, "loss": 0.0579, "step": 24420 }, { "epoch": 2.8897563283652707, "grad_norm": 2.8258938789367676, "learning_rate": 1.1593547723988254e-08, "loss": 0.0713, "step": 24430 }, { "epoch": 2.890939200378519, "grad_norm": 3.2606780529022217, "learning_rate": 1.1469286762316353e-08, "loss": 0.0686, "step": 24440 }, { "epoch": 2.8921220723917673, "grad_norm": 3.986778497695923, "learning_rate": 1.1345025800644455e-08, "loss": 0.0658, "step": 24450 }, { "epoch": 2.893304944405015, "grad_norm": 2.958326816558838, "learning_rate": 1.1220764838972555e-08, "loss": 0.0676, "step": 24460 }, { "epoch": 2.8944878164182635, "grad_norm": 2.464677095413208, "learning_rate": 1.1096503877300654e-08, "loss": 0.0649, "step": 24470 }, { "epoch": 2.8956706884315118, "grad_norm": 3.422867774963379, "learning_rate": 1.0972242915628754e-08, "loss": 0.075, "step": 24480 }, { "epoch": 2.89685356044476, "grad_norm": 2.794595241546631, "learning_rate": 1.0847981953956855e-08, "loss": 0.0713, "step": 24490 }, { "epoch": 2.898036432458008, "grad_norm": 3.2402408123016357, "learning_rate": 1.0723720992284955e-08, "loss": 0.0715, "step": 24500 }, { "epoch": 2.899219304471256, "grad_norm": 4.363686561584473, "learning_rate": 1.0599460030613055e-08, "loss": 0.066, "step": 24510 }, { "epoch": 2.9004021764845045, "grad_norm": 5.714462757110596, "learning_rate": 1.0475199068941156e-08, "loss": 0.0716, "step": 24520 }, { "epoch": 2.9015850484977523, "grad_norm": 3.017280340194702, "learning_rate": 1.0350938107269255e-08, "loss": 0.0725, "step": 24530 }, { "epoch": 2.9027679205110006, "grad_norm": 2.9771697521209717, "learning_rate": 1.0226677145597354e-08, "loss": 0.0675, "step": 24540 }, { "epoch": 2.903950792524249, "grad_norm": 3.198369264602661, "learning_rate": 1.0102416183925454e-08, "loss": 0.0717, "step": 24550 }, { "epoch": 2.905133664537497, "grad_norm": 2.723705291748047, "learning_rate": 9.978155222253556e-09, "loss": 0.0652, "step": 24560 }, { "epoch": 2.906316536550745, "grad_norm": 3.161634922027588, "learning_rate": 9.853894260581655e-09, "loss": 0.0646, "step": 24570 }, { "epoch": 2.9074994085639934, "grad_norm": 3.6538448333740234, "learning_rate": 9.729633298909755e-09, "loss": 0.0674, "step": 24580 }, { "epoch": 2.9086822805772417, "grad_norm": 3.818331241607666, "learning_rate": 9.605372337237857e-09, "loss": 0.0682, "step": 24590 }, { "epoch": 2.9098651525904895, "grad_norm": 3.170600414276123, "learning_rate": 9.481111375565956e-09, "loss": 0.0651, "step": 24600 }, { "epoch": 2.911048024603738, "grad_norm": 4.530123233795166, "learning_rate": 9.356850413894056e-09, "loss": 0.0736, "step": 24610 }, { "epoch": 2.912230896616986, "grad_norm": 3.154425859451294, "learning_rate": 9.232589452222158e-09, "loss": 0.0669, "step": 24620 }, { "epoch": 2.9134137686302344, "grad_norm": 3.63489031791687, "learning_rate": 9.108328490550257e-09, "loss": 0.0724, "step": 24630 }, { "epoch": 2.9145966406434822, "grad_norm": 3.4081993103027344, "learning_rate": 8.984067528878357e-09, "loss": 0.0674, "step": 24640 }, { "epoch": 2.9157795126567305, "grad_norm": 3.8762738704681396, "learning_rate": 8.859806567206457e-09, "loss": 0.0721, "step": 24650 }, { "epoch": 2.916962384669979, "grad_norm": 3.949063301086426, "learning_rate": 8.735545605534558e-09, "loss": 0.0714, "step": 24660 }, { "epoch": 2.9181452566832267, "grad_norm": 3.100996255874634, "learning_rate": 8.611284643862658e-09, "loss": 0.0673, "step": 24670 }, { "epoch": 2.919328128696475, "grad_norm": 2.6631784439086914, "learning_rate": 8.487023682190758e-09, "loss": 0.0776, "step": 24680 }, { "epoch": 2.9205110007097232, "grad_norm": 3.5142295360565186, "learning_rate": 8.362762720518858e-09, "loss": 0.0664, "step": 24690 }, { "epoch": 2.9216938727229715, "grad_norm": 3.791541576385498, "learning_rate": 8.238501758846957e-09, "loss": 0.0705, "step": 24700 }, { "epoch": 2.9228767447362194, "grad_norm": 3.6104886531829834, "learning_rate": 8.114240797175059e-09, "loss": 0.0679, "step": 24710 }, { "epoch": 2.9240596167494677, "grad_norm": 2.0339722633361816, "learning_rate": 7.989979835503159e-09, "loss": 0.0647, "step": 24720 }, { "epoch": 2.925242488762716, "grad_norm": 3.064103841781616, "learning_rate": 7.865718873831258e-09, "loss": 0.0715, "step": 24730 }, { "epoch": 2.926425360775964, "grad_norm": 3.099675178527832, "learning_rate": 7.74145791215936e-09, "loss": 0.0662, "step": 24740 }, { "epoch": 2.927608232789212, "grad_norm": 3.479001760482788, "learning_rate": 7.61719695048746e-09, "loss": 0.0674, "step": 24750 }, { "epoch": 2.9287911048024604, "grad_norm": 3.799346923828125, "learning_rate": 7.49293598881556e-09, "loss": 0.0769, "step": 24760 }, { "epoch": 2.9299739768157087, "grad_norm": 3.6996641159057617, "learning_rate": 7.368675027143659e-09, "loss": 0.0644, "step": 24770 }, { "epoch": 2.9311568488289566, "grad_norm": 2.9587948322296143, "learning_rate": 7.244414065471759e-09, "loss": 0.0694, "step": 24780 }, { "epoch": 2.932339720842205, "grad_norm": 2.906644821166992, "learning_rate": 7.1201531037998595e-09, "loss": 0.0669, "step": 24790 }, { "epoch": 2.933522592855453, "grad_norm": 4.5165019035339355, "learning_rate": 6.99589214212796e-09, "loss": 0.065, "step": 24800 }, { "epoch": 2.934705464868701, "grad_norm": 3.041423797607422, "learning_rate": 6.87163118045606e-09, "loss": 0.0675, "step": 24810 }, { "epoch": 2.9358883368819493, "grad_norm": 2.9522831439971924, "learning_rate": 6.7473702187841605e-09, "loss": 0.0678, "step": 24820 }, { "epoch": 2.9370712088951976, "grad_norm": 2.847414970397949, "learning_rate": 6.62310925711226e-09, "loss": 0.0706, "step": 24830 }, { "epoch": 2.938254080908446, "grad_norm": 3.2963452339172363, "learning_rate": 6.498848295440361e-09, "loss": 0.07, "step": 24840 }, { "epoch": 2.9394369529216937, "grad_norm": 3.133747100830078, "learning_rate": 6.3745873337684615e-09, "loss": 0.0665, "step": 24850 }, { "epoch": 2.940619824934942, "grad_norm": 3.4241445064544678, "learning_rate": 6.250326372096561e-09, "loss": 0.0592, "step": 24860 }, { "epoch": 2.9418026969481903, "grad_norm": 2.8078622817993164, "learning_rate": 6.126065410424661e-09, "loss": 0.0674, "step": 24870 }, { "epoch": 2.942985568961438, "grad_norm": 3.6524698734283447, "learning_rate": 6.001804448752761e-09, "loss": 0.0708, "step": 24880 }, { "epoch": 2.9441684409746864, "grad_norm": 3.7947916984558105, "learning_rate": 5.877543487080861e-09, "loss": 0.0622, "step": 24890 }, { "epoch": 2.9453513129879347, "grad_norm": 2.5568418502807617, "learning_rate": 5.753282525408962e-09, "loss": 0.0688, "step": 24900 }, { "epoch": 2.946534185001183, "grad_norm": 2.9753777980804443, "learning_rate": 5.629021563737062e-09, "loss": 0.0623, "step": 24910 }, { "epoch": 2.947717057014431, "grad_norm": 3.3387529850006104, "learning_rate": 5.504760602065162e-09, "loss": 0.0711, "step": 24920 }, { "epoch": 2.948899929027679, "grad_norm": 3.1687309741973877, "learning_rate": 5.380499640393262e-09, "loss": 0.0648, "step": 24930 }, { "epoch": 2.9500828010409275, "grad_norm": 3.90529465675354, "learning_rate": 5.256238678721363e-09, "loss": 0.0696, "step": 24940 }, { "epoch": 2.9512656730541753, "grad_norm": 3.0549163818359375, "learning_rate": 5.131977717049463e-09, "loss": 0.059, "step": 24950 }, { "epoch": 2.9524485450674236, "grad_norm": 2.866260290145874, "learning_rate": 5.007716755377563e-09, "loss": 0.055, "step": 24960 }, { "epoch": 2.953631417080672, "grad_norm": 2.9825284481048584, "learning_rate": 4.883455793705663e-09, "loss": 0.0717, "step": 24970 }, { "epoch": 2.95481428909392, "grad_norm": 3.4534432888031006, "learning_rate": 4.759194832033763e-09, "loss": 0.0688, "step": 24980 }, { "epoch": 2.9559971611071685, "grad_norm": 3.17519474029541, "learning_rate": 4.634933870361863e-09, "loss": 0.0701, "step": 24990 }, { "epoch": 2.9571800331204163, "grad_norm": 3.3433213233947754, "learning_rate": 4.510672908689963e-09, "loss": 0.0701, "step": 25000 }, { "epoch": 2.9583629051336646, "grad_norm": 3.48189377784729, "learning_rate": 4.386411947018064e-09, "loss": 0.0634, "step": 25010 }, { "epoch": 2.9595457771469125, "grad_norm": 2.7418007850646973, "learning_rate": 4.262150985346164e-09, "loss": 0.064, "step": 25020 }, { "epoch": 2.9607286491601608, "grad_norm": 2.6989002227783203, "learning_rate": 4.137890023674264e-09, "loss": 0.067, "step": 25030 }, { "epoch": 2.961911521173409, "grad_norm": 2.7168571949005127, "learning_rate": 4.013629062002364e-09, "loss": 0.0698, "step": 25040 }, { "epoch": 2.9630943931866573, "grad_norm": 2.6578383445739746, "learning_rate": 3.8893681003304644e-09, "loss": 0.0709, "step": 25050 }, { "epoch": 2.9642772651999056, "grad_norm": 3.460569381713867, "learning_rate": 3.765107138658564e-09, "loss": 0.0614, "step": 25060 }, { "epoch": 2.9654601372131535, "grad_norm": 3.3404994010925293, "learning_rate": 3.640846176986665e-09, "loss": 0.0739, "step": 25070 }, { "epoch": 2.966643009226402, "grad_norm": 2.6047067642211914, "learning_rate": 3.516585215314765e-09, "loss": 0.0705, "step": 25080 }, { "epoch": 2.9678258812396496, "grad_norm": 3.4798266887664795, "learning_rate": 3.3923242536428648e-09, "loss": 0.0694, "step": 25090 }, { "epoch": 2.969008753252898, "grad_norm": 4.240628719329834, "learning_rate": 3.2680632919709654e-09, "loss": 0.0678, "step": 25100 }, { "epoch": 2.970191625266146, "grad_norm": 3.9309751987457275, "learning_rate": 3.1438023302990656e-09, "loss": 0.0679, "step": 25110 }, { "epoch": 2.9713744972793945, "grad_norm": 3.1011784076690674, "learning_rate": 3.0195413686271658e-09, "loss": 0.0645, "step": 25120 }, { "epoch": 2.972557369292643, "grad_norm": 3.8540890216827393, "learning_rate": 2.895280406955266e-09, "loss": 0.0723, "step": 25130 }, { "epoch": 2.9737402413058907, "grad_norm": 3.0971550941467285, "learning_rate": 2.7710194452833657e-09, "loss": 0.0764, "step": 25140 }, { "epoch": 2.974923113319139, "grad_norm": 3.431619882583618, "learning_rate": 2.6467584836114663e-09, "loss": 0.065, "step": 25150 }, { "epoch": 2.976105985332387, "grad_norm": 3.1324000358581543, "learning_rate": 2.5224975219395665e-09, "loss": 0.0626, "step": 25160 }, { "epoch": 2.977288857345635, "grad_norm": 2.7309682369232178, "learning_rate": 2.3982365602676667e-09, "loss": 0.0621, "step": 25170 }, { "epoch": 2.9784717293588834, "grad_norm": 3.4220492839813232, "learning_rate": 2.273975598595767e-09, "loss": 0.0692, "step": 25180 }, { "epoch": 2.9796546013721317, "grad_norm": 2.560696601867676, "learning_rate": 2.149714636923867e-09, "loss": 0.0744, "step": 25190 }, { "epoch": 2.98083747338538, "grad_norm": 3.515239953994751, "learning_rate": 2.025453675251967e-09, "loss": 0.0657, "step": 25200 }, { "epoch": 2.982020345398628, "grad_norm": 3.101961851119995, "learning_rate": 1.9011927135800675e-09, "loss": 0.0675, "step": 25210 }, { "epoch": 2.983203217411876, "grad_norm": 2.3070228099823, "learning_rate": 1.7769317519081672e-09, "loss": 0.0659, "step": 25220 }, { "epoch": 2.984386089425124, "grad_norm": 3.118229627609253, "learning_rate": 1.6526707902362676e-09, "loss": 0.0751, "step": 25230 }, { "epoch": 2.9855689614383722, "grad_norm": 3.6132593154907227, "learning_rate": 1.5284098285643678e-09, "loss": 0.0716, "step": 25240 }, { "epoch": 2.9867518334516205, "grad_norm": 3.0998775959014893, "learning_rate": 1.404148866892468e-09, "loss": 0.0682, "step": 25250 }, { "epoch": 2.987934705464869, "grad_norm": 2.6996052265167236, "learning_rate": 1.2798879052205682e-09, "loss": 0.0723, "step": 25260 }, { "epoch": 2.989117577478117, "grad_norm": 2.944080114364624, "learning_rate": 1.1556269435486682e-09, "loss": 0.0695, "step": 25270 }, { "epoch": 2.990300449491365, "grad_norm": 2.7454302310943604, "learning_rate": 1.0313659818767686e-09, "loss": 0.0745, "step": 25280 }, { "epoch": 2.9914833215046133, "grad_norm": 2.5875661373138428, "learning_rate": 9.071050202048687e-10, "loss": 0.0646, "step": 25290 }, { "epoch": 2.992666193517861, "grad_norm": 3.329366683959961, "learning_rate": 7.828440585329689e-10, "loss": 0.0648, "step": 25300 }, { "epoch": 2.9938490655311094, "grad_norm": 2.7645976543426514, "learning_rate": 6.585830968610692e-10, "loss": 0.0679, "step": 25310 }, { "epoch": 2.9950319375443577, "grad_norm": 2.899848699569702, "learning_rate": 5.343221351891692e-10, "loss": 0.0644, "step": 25320 }, { "epoch": 2.996214809557606, "grad_norm": 3.0567593574523926, "learning_rate": 4.1006117351726944e-10, "loss": 0.0703, "step": 25330 }, { "epoch": 2.9973976815708543, "grad_norm": 2.7045652866363525, "learning_rate": 2.858002118453696e-10, "loss": 0.0708, "step": 25340 }, { "epoch": 2.998580553584102, "grad_norm": 2.604355812072754, "learning_rate": 1.6153925017346976e-10, "loss": 0.0721, "step": 25350 }, { "epoch": 2.9997634255973504, "grad_norm": 3.1211905479431152, "learning_rate": 3.727828850156995e-11, "loss": 0.0607, "step": 25360 }, { "epoch": 3.0, "grad_norm": 3.0697312355041504, "learning_rate": 1.2426096167189982e-11, "loss": 0.06, "step": 25362 }, { "epoch": 3.0, "eval_accuracy": 0.6964766942808663, "eval_flagged/accuracy": 0.8561233656053499, "eval_flagged/f1": 0.8650091304958561, "eval_flagged/precision": 0.9049967341606793, "eval_flagged/recall": 0.8284057277809334, "eval_loss": 0.07808935642242432, "eval_macro_f1": 0.6443586151114484, "eval_macro_precision": 0.7361139772519877, "eval_macro_recall": 0.5968482813911912, "eval_micro_f1": 0.7539058443703921, "eval_micro_precision": 0.8035376504205739, "eval_micro_recall": 0.7100485445525188, "eval_runtime": 87.4927, "eval_samples_per_second": 687.074, "eval_steps_per_second": 5.372, "step": 25362 }, { "epoch": 3.0, "step": 25362, "total_flos": 1.0625604952409506e+19, "train_loss": 0.07807489077325368, "train_runtime": 15386.354, "train_samples_per_second": 105.487, "train_steps_per_second": 1.648 } ], "logging_steps": 10, "max_steps": 25362, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0625604952409506e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }