{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6547167551598866, "eval_steps": 500, "global_step": 8200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007984350672681544, "grad_norm": 3.5232925702802524, "learning_rate": 2.942208207641671e-05, "loss": 0.8424, "step": 10 }, { "epoch": 0.0015968701345363088, "grad_norm": 3.736142434929718, "learning_rate": 3.8279011316305725e-05, "loss": 0.7905, "step": 20 }, { "epoch": 0.0023953052018044632, "grad_norm": 4.955020334614627, "learning_rate": 4.345998279318154e-05, "loss": 0.671, "step": 30 }, { "epoch": 0.0031937402690726175, "grad_norm": 8.588687610785087, "learning_rate": 4.713594055619474e-05, "loss": 0.5332, "step": 40 }, { "epoch": 0.003992175336340772, "grad_norm": 5.482949306795534, "learning_rate": 4.9987234912944383e-05, "loss": 0.3416, "step": 50 }, { "epoch": 0.0047906104036089265, "grad_norm": 6.784951935057385, "learning_rate": 5.231691203307055e-05, "loss": 0.2779, "step": 60 }, { "epoch": 0.005589045470877081, "grad_norm": 5.51385584449277, "learning_rate": 5.428662597233506e-05, "loss": 0.2096, "step": 70 }, { "epoch": 0.006387480538145235, "grad_norm": 4.2244974879162, "learning_rate": 5.599286979608376e-05, "loss": 0.252, "step": 80 }, { "epoch": 0.00718591560541339, "grad_norm": 3.733833668520439, "learning_rate": 5.749788350994636e-05, "loss": 0.1798, "step": 90 }, { "epoch": 0.007984350672681544, "grad_norm": 2.261256472451689, "learning_rate": 5.884416415283342e-05, "loss": 0.1689, "step": 100 }, { "epoch": 0.008782785739949698, "grad_norm": 1.801856803973853, "learning_rate": 6.0062023132921985e-05, "loss": 0.1418, "step": 110 }, { "epoch": 0.009581220807217853, "grad_norm": 5.830051184887845, "learning_rate": 6.117384127295957e-05, "loss": 0.1148, "step": 120 }, { "epoch": 0.010379655874486008, "grad_norm": 0.5938772441566059, "learning_rate": 6.219661481646723e-05, "loss": 0.0991, "step": 130 }, { "epoch": 0.011178090941754162, "grad_norm": 3.3844685803924732, "learning_rate": 6.314355521222407e-05, "loss": 0.121, "step": 140 }, { "epoch": 0.011976526009022317, "grad_norm": 2.6364672052008387, "learning_rate": 6.402513562970922e-05, "loss": 0.0952, "step": 150 }, { "epoch": 0.01277496107629047, "grad_norm": 2.2940383537390376, "learning_rate": 6.484979903597278e-05, "loss": 0.1226, "step": 160 }, { "epoch": 0.013573396143558625, "grad_norm": 2.0903566324645033, "learning_rate": 6.562445123204669e-05, "loss": 0.115, "step": 170 }, { "epoch": 0.01437183121082678, "grad_norm": 1.5992990163675893, "learning_rate": 6.635481274983538e-05, "loss": 0.1142, "step": 180 }, { "epoch": 0.015170266278094934, "grad_norm": 1.1091866267461175, "learning_rate": 6.704567547916425e-05, "loss": 0.0993, "step": 190 }, { "epoch": 0.015968701345363087, "grad_norm": 2.2156336864111146, "learning_rate": 6.770109339272242e-05, "loss": 0.132, "step": 200 }, { "epoch": 0.016767136412631244, "grad_norm": 3.251336662728112, "learning_rate": 6.832452668909989e-05, "loss": 0.0737, "step": 210 }, { "epoch": 0.017565571479899397, "grad_norm": 2.568444919571399, "learning_rate": 6.8918952372811e-05, "loss": 0.0907, "step": 220 }, { "epoch": 0.018364006547167553, "grad_norm": 0.7114489570730391, "learning_rate": 6.948695023346763e-05, "loss": 0.0781, "step": 230 }, { "epoch": 0.019162441614435706, "grad_norm": 2.8086271912993794, "learning_rate": 7.003077051284858e-05, "loss": 0.115, "step": 240 }, { "epoch": 0.01996087668170386, "grad_norm": 1.900134521715012, "learning_rate": 7.055238774947207e-05, "loss": 0.0993, "step": 250 }, { "epoch": 0.020759311748972015, "grad_norm": 0.6776229066938406, "learning_rate": 7.105354405635625e-05, "loss": 0.0948, "step": 260 }, { "epoch": 0.02155774681624017, "grad_norm": 1.8867241168177415, "learning_rate": 7.15357842267112e-05, "loss": 0.077, "step": 270 }, { "epoch": 0.022356181883508325, "grad_norm": 1.2503978176846782, "learning_rate": 7.20004844521131e-05, "loss": 0.0829, "step": 280 }, { "epoch": 0.023154616950776478, "grad_norm": 1.47247849944815, "learning_rate": 7.244887599898725e-05, "loss": 0.0887, "step": 290 }, { "epoch": 0.023953052018044634, "grad_norm": 1.3484244136088377, "learning_rate": 7.288206486959823e-05, "loss": 0.1163, "step": 300 }, { "epoch": 0.024751487085312787, "grad_norm": 7.942468080626284, "learning_rate": 7.330104823803253e-05, "loss": 0.1425, "step": 310 }, { "epoch": 0.02554992215258094, "grad_norm": 4.655099563025342, "learning_rate": 7.37067282758618e-05, "loss": 0.1409, "step": 320 }, { "epoch": 0.026348357219849097, "grad_norm": 1.3927603965707027, "learning_rate": 7.40999238496868e-05, "loss": 0.0887, "step": 330 }, { "epoch": 0.02714679228711725, "grad_norm": 0.6712166946336496, "learning_rate": 7.448138047193572e-05, "loss": 0.0701, "step": 340 }, { "epoch": 0.027945227354385406, "grad_norm": 0.819398704134634, "learning_rate": 7.485177880886274e-05, "loss": 0.064, "step": 350 }, { "epoch": 0.02874366242165356, "grad_norm": 1.5850883570722307, "learning_rate": 7.52117419897244e-05, "loss": 0.1039, "step": 360 }, { "epoch": 0.029542097488921712, "grad_norm": 3.4982586330431267, "learning_rate": 7.556184191429401e-05, "loss": 0.0512, "step": 370 }, { "epoch": 0.03034053255618987, "grad_norm": 1.2751502486060688, "learning_rate": 7.590260471905327e-05, "loss": 0.082, "step": 380 }, { "epoch": 0.03113896762345802, "grad_norm": 1.0821666549195599, "learning_rate": 7.623451553323206e-05, "loss": 0.0838, "step": 390 }, { "epoch": 0.031937402690726174, "grad_norm": 0.3179937672264547, "learning_rate": 7.655802263261145e-05, "loss": 0.0682, "step": 400 }, { "epoch": 0.03273583775799433, "grad_norm": 0.7683525367800101, "learning_rate": 7.687354108034464e-05, "loss": 0.0751, "step": 410 }, { "epoch": 0.03353427282526249, "grad_norm": 1.2004951837928237, "learning_rate": 7.71814559289889e-05, "loss": 0.1034, "step": 420 }, { "epoch": 0.03433270789253064, "grad_norm": 0.8513751742352591, "learning_rate": 7.748212504571692e-05, "loss": 0.1099, "step": 430 }, { "epoch": 0.03513114295979879, "grad_norm": 0.6047209002381277, "learning_rate": 7.777588161270002e-05, "loss": 0.0657, "step": 440 }, { "epoch": 0.03592957802706695, "grad_norm": 1.4997073770835052, "learning_rate": 7.806303634647405e-05, "loss": 0.0663, "step": 450 }, { "epoch": 0.036728013094335106, "grad_norm": 2.450672575042776, "learning_rate": 7.834387947335665e-05, "loss": 0.0764, "step": 460 }, { "epoch": 0.037526448161603256, "grad_norm": 1.5816060633166842, "learning_rate": 7.861868249240195e-05, "loss": 0.0894, "step": 470 }, { "epoch": 0.03832488322887141, "grad_norm": 0.8978231524516874, "learning_rate": 7.888769975273761e-05, "loss": 0.093, "step": 480 }, { "epoch": 0.03912331829613957, "grad_norm": 0.31880545463652393, "learning_rate": 7.91511698682534e-05, "loss": 0.0918, "step": 490 }, { "epoch": 0.03992175336340772, "grad_norm": 0.41662701320676065, "learning_rate": 7.94093169893611e-05, "loss": 0.1022, "step": 500 }, { "epoch": 0.040720188430675874, "grad_norm": 1.6604197089055341, "learning_rate": 7.966235194881151e-05, "loss": 0.0577, "step": 510 }, { "epoch": 0.04151862349794403, "grad_norm": 0.5952537112068932, "learning_rate": 7.991047329624525e-05, "loss": 0.0824, "step": 520 }, { "epoch": 0.04231705856521219, "grad_norm": 1.057400098552263, "learning_rate": 8.01538682341959e-05, "loss": 0.0635, "step": 530 }, { "epoch": 0.04311549363248034, "grad_norm": 1.4367457504094763, "learning_rate": 8.039271346660021e-05, "loss": 0.0749, "step": 540 }, { "epoch": 0.04391392869974849, "grad_norm": 1.4304249110893281, "learning_rate": 8.062717596944966e-05, "loss": 0.083, "step": 550 }, { "epoch": 0.04471236376701665, "grad_norm": 0.7235154604788787, "learning_rate": 8.085741369200211e-05, "loss": 0.0688, "step": 560 }, { "epoch": 0.0455107988342848, "grad_norm": 0.3530930527250422, "learning_rate": 8.108357619592909e-05, "loss": 0.0929, "step": 570 }, { "epoch": 0.046309233901552956, "grad_norm": 0.9718806338973844, "learning_rate": 8.130580523887628e-05, "loss": 0.0553, "step": 580 }, { "epoch": 0.04710766896882111, "grad_norm": 1.2335130967238364, "learning_rate": 8.15242353081395e-05, "loss": 0.1021, "step": 590 }, { "epoch": 0.04790610403608927, "grad_norm": 0.3228008866221465, "learning_rate": 8.173899410948726e-05, "loss": 0.0791, "step": 600 }, { "epoch": 0.04870453910335742, "grad_norm": 1.3015629710425247, "learning_rate": 8.195020301557898e-05, "loss": 0.0692, "step": 610 }, { "epoch": 0.049502974170625574, "grad_norm": 0.6936581823866799, "learning_rate": 8.215797747792154e-05, "loss": 0.091, "step": 620 }, { "epoch": 0.05030140923789373, "grad_norm": 2.046468793339183, "learning_rate": 8.236242740586472e-05, "loss": 0.0785, "step": 630 }, { "epoch": 0.05109984430516188, "grad_norm": 4.620875624681255, "learning_rate": 8.256365751575082e-05, "loss": 0.1094, "step": 640 }, { "epoch": 0.05189827937243004, "grad_norm": 1.0255647708402473, "learning_rate": 8.27617676529949e-05, "loss": 0.0535, "step": 650 }, { "epoch": 0.05269671443969819, "grad_norm": 0.4616344444526033, "learning_rate": 8.295685308957582e-05, "loss": 0.0787, "step": 660 }, { "epoch": 0.05349514950696634, "grad_norm": 1.3330176172712866, "learning_rate": 8.314900479915685e-05, "loss": 0.0914, "step": 670 }, { "epoch": 0.0542935845742345, "grad_norm": 1.126229916038408, "learning_rate": 8.333830971182472e-05, "loss": 0.1071, "step": 680 }, { "epoch": 0.055092019641502656, "grad_norm": 0.9532218500993943, "learning_rate": 8.352485095023246e-05, "loss": 0.0629, "step": 690 }, { "epoch": 0.05589045470877081, "grad_norm": 1.2383873083152779, "learning_rate": 8.370870804875176e-05, "loss": 0.1098, "step": 700 }, { "epoch": 0.05668888977603896, "grad_norm": 0.9810020490409863, "learning_rate": 8.388995715708101e-05, "loss": 0.0736, "step": 710 }, { "epoch": 0.05748732484330712, "grad_norm": 0.3971929526776789, "learning_rate": 8.406867122961343e-05, "loss": 0.0773, "step": 720 }, { "epoch": 0.058285759910575274, "grad_norm": 0.7108087299273486, "learning_rate": 8.424492020174429e-05, "loss": 0.1098, "step": 730 }, { "epoch": 0.059084194977843424, "grad_norm": 2.2341124836718995, "learning_rate": 8.441877115418304e-05, "loss": 0.0813, "step": 740 }, { "epoch": 0.05988263004511158, "grad_norm": 0.38550605242633584, "learning_rate": 8.45902884662369e-05, "loss": 0.0715, "step": 750 }, { "epoch": 0.06068106511237974, "grad_norm": 0.4335701502840811, "learning_rate": 8.47595339589423e-05, "loss": 0.076, "step": 760 }, { "epoch": 0.06147950017964789, "grad_norm": 0.4115317244701979, "learning_rate": 8.492656702884034e-05, "loss": 0.0953, "step": 770 }, { "epoch": 0.06227793524691604, "grad_norm": 1.840713686032294, "learning_rate": 8.509144477312107e-05, "loss": 0.0764, "step": 780 }, { "epoch": 0.0630763703141842, "grad_norm": 1.4905547592936923, "learning_rate": 8.525422210679596e-05, "loss": 0.0725, "step": 790 }, { "epoch": 0.06387480538145235, "grad_norm": 0.5996295060118539, "learning_rate": 8.541495187250046e-05, "loss": 0.0769, "step": 800 }, { "epoch": 0.06467324044872051, "grad_norm": 1.3404150641883488, "learning_rate": 8.557368494347602e-05, "loss": 0.0821, "step": 810 }, { "epoch": 0.06547167551598866, "grad_norm": 0.4748718025375387, "learning_rate": 8.573047032023365e-05, "loss": 0.0874, "step": 820 }, { "epoch": 0.06627011058325681, "grad_norm": 0.7725716996053142, "learning_rate": 8.588535522135874e-05, "loss": 0.0772, "step": 830 }, { "epoch": 0.06706854565052497, "grad_norm": 0.822250642719596, "learning_rate": 8.603838516887792e-05, "loss": 0.0606, "step": 840 }, { "epoch": 0.06786698071779312, "grad_norm": 0.6797537145210232, "learning_rate": 8.618960406857437e-05, "loss": 0.0587, "step": 850 }, { "epoch": 0.06866541578506127, "grad_norm": 0.3442856654043566, "learning_rate": 8.633905428560595e-05, "loss": 0.0791, "step": 860 }, { "epoch": 0.06946385085232944, "grad_norm": 1.9321932414949274, "learning_rate": 8.648677671575208e-05, "loss": 0.1101, "step": 870 }, { "epoch": 0.07026228591959759, "grad_norm": 0.8091988266382711, "learning_rate": 8.663281085258905e-05, "loss": 0.0935, "step": 880 }, { "epoch": 0.07106072098686575, "grad_norm": 0.9736053752084688, "learning_rate": 8.677719485086984e-05, "loss": 0.0892, "step": 890 }, { "epoch": 0.0718591560541339, "grad_norm": 0.8209361814313335, "learning_rate": 8.691996558636308e-05, "loss": 0.0737, "step": 900 }, { "epoch": 0.07265759112140205, "grad_norm": 0.4035267647165605, "learning_rate": 8.706115871238559e-05, "loss": 0.0553, "step": 910 }, { "epoch": 0.07345602618867021, "grad_norm": 0.7616589733118284, "learning_rate": 8.720080871324567e-05, "loss": 0.0615, "step": 920 }, { "epoch": 0.07425446125593836, "grad_norm": 3.3729080844699326, "learning_rate": 8.733894895479733e-05, "loss": 0.0819, "step": 930 }, { "epoch": 0.07505289632320651, "grad_norm": 0.15929664401491783, "learning_rate": 8.747561173229096e-05, "loss": 0.0701, "step": 940 }, { "epoch": 0.07585133139047467, "grad_norm": 0.884846200272443, "learning_rate": 8.761082831569194e-05, "loss": 0.0846, "step": 950 }, { "epoch": 0.07664976645774282, "grad_norm": 0.4976508744267154, "learning_rate": 8.774462899262663e-05, "loss": 0.1038, "step": 960 }, { "epoch": 0.07744820152501097, "grad_norm": 2.4405255315055334, "learning_rate": 8.787704310910292e-05, "loss": 0.083, "step": 970 }, { "epoch": 0.07824663659227914, "grad_norm": 0.5928086171271278, "learning_rate": 8.800809910814243e-05, "loss": 0.0613, "step": 980 }, { "epoch": 0.07904507165954729, "grad_norm": 0.7380085702017415, "learning_rate": 8.813782456645164e-05, "loss": 0.0741, "step": 990 }, { "epoch": 0.07984350672681544, "grad_norm": 0.8908559197171063, "learning_rate": 8.826624622925012e-05, "loss": 0.0609, "step": 1000 }, { "epoch": 0.0806419417940836, "grad_norm": 1.3924647259072407, "learning_rate": 8.83933900433659e-05, "loss": 0.0804, "step": 1010 }, { "epoch": 0.08144037686135175, "grad_norm": 0.6744120448203766, "learning_rate": 8.851928118870054e-05, "loss": 0.0782, "step": 1020 }, { "epoch": 0.0822388119286199, "grad_norm": 1.2461058480859704, "learning_rate": 8.86439441081591e-05, "loss": 0.0626, "step": 1030 }, { "epoch": 0.08303724699588806, "grad_norm": 2.467688787000267, "learning_rate": 8.876740253613428e-05, "loss": 0.087, "step": 1040 }, { "epoch": 0.08383568206315621, "grad_norm": 0.6757524264432309, "learning_rate": 8.888967952562756e-05, "loss": 0.086, "step": 1050 }, { "epoch": 0.08463411713042437, "grad_norm": 2.165699518864385, "learning_rate": 8.901079747408493e-05, "loss": 0.0733, "step": 1060 }, { "epoch": 0.08543255219769252, "grad_norm": 0.3953210869404968, "learning_rate": 8.913077814801954e-05, "loss": 0.0962, "step": 1070 }, { "epoch": 0.08623098726496067, "grad_norm": 0.5296065235489941, "learning_rate": 8.924964270648924e-05, "loss": 0.06, "step": 1080 }, { "epoch": 0.08702942233222884, "grad_norm": 0.20038342872971074, "learning_rate": 8.936741172349198e-05, "loss": 0.0775, "step": 1090 }, { "epoch": 0.08782785739949699, "grad_norm": 0.8356649350481642, "learning_rate": 8.948410520933869e-05, "loss": 0.0868, "step": 1100 }, { "epoch": 0.08862629246676514, "grad_norm": 1.0589224823029895, "learning_rate": 8.959974263105884e-05, "loss": 0.0619, "step": 1110 }, { "epoch": 0.0894247275340333, "grad_norm": 0.7960185516540815, "learning_rate": 8.971434293189114e-05, "loss": 0.0576, "step": 1120 }, { "epoch": 0.09022316260130145, "grad_norm": 1.2994761751707133, "learning_rate": 8.982792454990772e-05, "loss": 0.0812, "step": 1130 }, { "epoch": 0.0910215976685696, "grad_norm": 0.822292570186698, "learning_rate": 8.99405054358181e-05, "loss": 0.0691, "step": 1140 }, { "epoch": 0.09182003273583776, "grad_norm": 1.241640377765173, "learning_rate": 9.005210306999533e-05, "loss": 0.0619, "step": 1150 }, { "epoch": 0.09261846780310591, "grad_norm": 0.7758797635010947, "learning_rate": 9.01627344787653e-05, "loss": 0.0922, "step": 1160 }, { "epoch": 0.09341690287037406, "grad_norm": 1.104880691096772, "learning_rate": 9.027241624999689e-05, "loss": 0.0981, "step": 1170 }, { "epoch": 0.09421533793764222, "grad_norm": 0.10918433398556385, "learning_rate": 9.038116454802852e-05, "loss": 0.0612, "step": 1180 }, { "epoch": 0.09501377300491037, "grad_norm": 0.5247380213024838, "learning_rate": 9.048899512796504e-05, "loss": 0.0687, "step": 1190 }, { "epoch": 0.09581220807217854, "grad_norm": 0.5310899070310212, "learning_rate": 9.059592334937628e-05, "loss": 0.0725, "step": 1200 }, { "epoch": 0.09661064313944669, "grad_norm": 0.13475686039340207, "learning_rate": 9.070196418942726e-05, "loss": 0.0586, "step": 1210 }, { "epoch": 0.09740907820671484, "grad_norm": 0.24722920106750101, "learning_rate": 9.080713225546801e-05, "loss": 0.0704, "step": 1220 }, { "epoch": 0.098207513273983, "grad_norm": 1.8397525090103166, "learning_rate": 9.091144179710946e-05, "loss": 0.0792, "step": 1230 }, { "epoch": 0.09900594834125115, "grad_norm": 0.5542485878904292, "learning_rate": 9.101490671781055e-05, "loss": 0.0967, "step": 1240 }, { "epoch": 0.0998043834085193, "grad_norm": 0.3583983335842228, "learning_rate": 9.111754058599977e-05, "loss": 0.0608, "step": 1250 }, { "epoch": 0.10060281847578746, "grad_norm": 1.0065124160311716, "learning_rate": 9.121935664575374e-05, "loss": 0.1035, "step": 1260 }, { "epoch": 0.10140125354305561, "grad_norm": 0.4666588580387682, "learning_rate": 9.132036782705373e-05, "loss": 0.0687, "step": 1270 }, { "epoch": 0.10219968861032376, "grad_norm": 0.3623496777731359, "learning_rate": 9.142058675563983e-05, "loss": 0.0732, "step": 1280 }, { "epoch": 0.10299812367759192, "grad_norm": 0.26488301439731154, "learning_rate": 9.152002576248177e-05, "loss": 0.1041, "step": 1290 }, { "epoch": 0.10379655874486007, "grad_norm": 0.1888961199003232, "learning_rate": 9.161869689288392e-05, "loss": 0.0577, "step": 1300 }, { "epoch": 0.10459499381212822, "grad_norm": 0.24802960871773072, "learning_rate": 9.171661191524173e-05, "loss": 0.0592, "step": 1310 }, { "epoch": 0.10539342887939639, "grad_norm": 0.8859054279177876, "learning_rate": 9.181378232946485e-05, "loss": 0.0737, "step": 1320 }, { "epoch": 0.10619186394666454, "grad_norm": 0.6342449000208032, "learning_rate": 9.191021937508261e-05, "loss": 0.07, "step": 1330 }, { "epoch": 0.10699029901393269, "grad_norm": 0.19646033159322437, "learning_rate": 9.200593403904588e-05, "loss": 0.0775, "step": 1340 }, { "epoch": 0.10778873408120085, "grad_norm": 0.23174924542751033, "learning_rate": 9.210093706323888e-05, "loss": 0.0823, "step": 1350 }, { "epoch": 0.108587169148469, "grad_norm": 0.5011075581503674, "learning_rate": 9.219523895171373e-05, "loss": 0.0671, "step": 1360 }, { "epoch": 0.10938560421573716, "grad_norm": 0.936119632385534, "learning_rate": 9.228884997766016e-05, "loss": 0.0592, "step": 1370 }, { "epoch": 0.11018403928300531, "grad_norm": 0.7797368432433808, "learning_rate": 9.238178019012147e-05, "loss": 0.0647, "step": 1380 }, { "epoch": 0.11098247435027346, "grad_norm": 0.6468052400331983, "learning_rate": 9.247403942046845e-05, "loss": 0.095, "step": 1390 }, { "epoch": 0.11178090941754162, "grad_norm": 1.5790945386597242, "learning_rate": 9.256563728864078e-05, "loss": 0.0785, "step": 1400 }, { "epoch": 0.11257934448480977, "grad_norm": 0.48706241804156847, "learning_rate": 9.265658320916678e-05, "loss": 0.074, "step": 1410 }, { "epoch": 0.11337777955207792, "grad_norm": 0.6031988051095882, "learning_rate": 9.274688639697002e-05, "loss": 0.1009, "step": 1420 }, { "epoch": 0.11417621461934609, "grad_norm": 0.5190122244621026, "learning_rate": 9.283655587297249e-05, "loss": 0.0843, "step": 1430 }, { "epoch": 0.11497464968661424, "grad_norm": 0.9512689902993662, "learning_rate": 9.292560046950244e-05, "loss": 0.0715, "step": 1440 }, { "epoch": 0.11577308475388239, "grad_norm": 0.40864222618071416, "learning_rate": 9.301402883551495e-05, "loss": 0.0755, "step": 1450 }, { "epoch": 0.11657151982115055, "grad_norm": 0.7769625588538747, "learning_rate": 9.31018494416333e-05, "loss": 0.0757, "step": 1460 }, { "epoch": 0.1173699548884187, "grad_norm": 0.3969001840652321, "learning_rate": 9.318907058501824e-05, "loss": 0.066, "step": 1470 }, { "epoch": 0.11816838995568685, "grad_norm": 1.0891536238630175, "learning_rate": 9.327570039407205e-05, "loss": 0.0685, "step": 1480 }, { "epoch": 0.11896682502295501, "grad_norm": 0.7825998513796186, "learning_rate": 9.336174683298437e-05, "loss": 0.068, "step": 1490 }, { "epoch": 0.11976526009022316, "grad_norm": 0.6119434023978193, "learning_rate": 9.344721770612593e-05, "loss": 0.0708, "step": 1500 }, { "epoch": 0.12056369515749131, "grad_norm": 0.6392505815350028, "learning_rate": 9.353212066229625e-05, "loss": 0.0696, "step": 1510 }, { "epoch": 0.12136213022475947, "grad_norm": 0.23794947831500443, "learning_rate": 9.361646319883132e-05, "loss": 0.081, "step": 1520 }, { "epoch": 0.12216056529202762, "grad_norm": 0.6909279964700517, "learning_rate": 9.370025266557636e-05, "loss": 0.0718, "step": 1530 }, { "epoch": 0.12295900035929579, "grad_norm": 1.1048882763470222, "learning_rate": 9.378349626872935e-05, "loss": 0.0788, "step": 1540 }, { "epoch": 0.12375743542656394, "grad_norm": 0.12156789791117421, "learning_rate": 9.38662010745602e-05, "loss": 0.0814, "step": 1550 }, { "epoch": 0.12455587049383209, "grad_norm": 2.026304704972604, "learning_rate": 9.39483740130101e-05, "loss": 0.0921, "step": 1560 }, { "epoch": 0.12535430556110025, "grad_norm": 2.025916307932843, "learning_rate": 9.403002188117609e-05, "loss": 0.058, "step": 1570 }, { "epoch": 0.1261527406283684, "grad_norm": 0.21178485251660298, "learning_rate": 9.411115134668499e-05, "loss": 0.0598, "step": 1580 }, { "epoch": 0.12695117569563655, "grad_norm": 1.4284181500108823, "learning_rate": 9.419176895096073e-05, "loss": 0.0814, "step": 1590 }, { "epoch": 0.1277496107629047, "grad_norm": 0.7843104603155183, "learning_rate": 9.427188111238948e-05, "loss": 0.042, "step": 1600 }, { "epoch": 0.12854804583017287, "grad_norm": 1.2008220942804122, "learning_rate": 9.435149412938599e-05, "loss": 0.0901, "step": 1610 }, { "epoch": 0.12934648089744102, "grad_norm": 0.33193358515873145, "learning_rate": 9.443061418336504e-05, "loss": 0.06, "step": 1620 }, { "epoch": 0.13014491596470917, "grad_norm": 0.5099792717245282, "learning_rate": 9.45092473416216e-05, "loss": 0.0561, "step": 1630 }, { "epoch": 0.13094335103197732, "grad_norm": 0.5997349133416637, "learning_rate": 9.458739956012268e-05, "loss": 0.0681, "step": 1640 }, { "epoch": 0.13174178609924547, "grad_norm": 0.2113301672959503, "learning_rate": 9.46650766862145e-05, "loss": 0.0752, "step": 1650 }, { "epoch": 0.13254022116651362, "grad_norm": 0.5787929964170312, "learning_rate": 9.474228446124777e-05, "loss": 0.0656, "step": 1660 }, { "epoch": 0.1333386562337818, "grad_norm": 0.8127539428562134, "learning_rate": 9.48190285231242e-05, "loss": 0.0899, "step": 1670 }, { "epoch": 0.13413709130104995, "grad_norm": 0.7650823029213837, "learning_rate": 9.489531440876694e-05, "loss": 0.0651, "step": 1680 }, { "epoch": 0.1349355263683181, "grad_norm": 0.28754919982145066, "learning_rate": 9.497114755651775e-05, "loss": 0.0847, "step": 1690 }, { "epoch": 0.13573396143558625, "grad_norm": 0.13862826523964283, "learning_rate": 9.504653330846339e-05, "loss": 0.0347, "step": 1700 }, { "epoch": 0.1365323965028544, "grad_norm": 0.5268722016235668, "learning_rate": 9.51214769126939e-05, "loss": 0.0656, "step": 1710 }, { "epoch": 0.13733083157012255, "grad_norm": 0.5178142175719089, "learning_rate": 9.519598352549497e-05, "loss": 0.0395, "step": 1720 }, { "epoch": 0.13812926663739072, "grad_norm": 0.6866453511139639, "learning_rate": 9.527005821347668e-05, "loss": 0.0714, "step": 1730 }, { "epoch": 0.13892770170465887, "grad_norm": 0.259840861988313, "learning_rate": 9.534370595564111e-05, "loss": 0.0561, "step": 1740 }, { "epoch": 0.13972613677192702, "grad_norm": 1.3308950214123239, "learning_rate": 9.541693164539043e-05, "loss": 0.0781, "step": 1750 }, { "epoch": 0.14052457183919517, "grad_norm": 0.349179761935008, "learning_rate": 9.548974009247806e-05, "loss": 0.0638, "step": 1760 }, { "epoch": 0.14132300690646332, "grad_norm": 1.4789155167397658, "learning_rate": 9.556213602490433e-05, "loss": 0.0521, "step": 1770 }, { "epoch": 0.1421214419737315, "grad_norm": 0.5314115346714045, "learning_rate": 9.563412409075885e-05, "loss": 0.0453, "step": 1780 }, { "epoch": 0.14291987704099965, "grad_norm": 0.7719157195860719, "learning_rate": 9.570570886001126e-05, "loss": 0.0576, "step": 1790 }, { "epoch": 0.1437183121082678, "grad_norm": 0.5365584658316398, "learning_rate": 9.577689482625209e-05, "loss": 0.0884, "step": 1800 }, { "epoch": 0.14451674717553595, "grad_norm": 0.40980247060477115, "learning_rate": 9.584768640838534e-05, "loss": 0.0633, "step": 1810 }, { "epoch": 0.1453151822428041, "grad_norm": 1.0230342101543866, "learning_rate": 9.59180879522746e-05, "loss": 0.072, "step": 1820 }, { "epoch": 0.14611361731007225, "grad_norm": 0.37391519429618736, "learning_rate": 9.598810373234382e-05, "loss": 0.062, "step": 1830 }, { "epoch": 0.14691205237734042, "grad_norm": 0.5546899786228485, "learning_rate": 9.605773795313468e-05, "loss": 0.0811, "step": 1840 }, { "epoch": 0.14771048744460857, "grad_norm": 0.5121481165129823, "learning_rate": 9.61269947508217e-05, "loss": 0.0709, "step": 1850 }, { "epoch": 0.14850892251187672, "grad_norm": 0.3968585153073068, "learning_rate": 9.619587819468636e-05, "loss": 0.0753, "step": 1860 }, { "epoch": 0.14930735757914487, "grad_norm": 0.830639936616994, "learning_rate": 9.626439228855197e-05, "loss": 0.0684, "step": 1870 }, { "epoch": 0.15010579264641302, "grad_norm": 0.2864308337848012, "learning_rate": 9.633254097217999e-05, "loss": 0.0595, "step": 1880 }, { "epoch": 0.15090422771368117, "grad_norm": 0.34372011057055735, "learning_rate": 9.640032812262954e-05, "loss": 0.0851, "step": 1890 }, { "epoch": 0.15170266278094935, "grad_norm": 0.7457036317344385, "learning_rate": 9.646775755558097e-05, "loss": 0.0537, "step": 1900 }, { "epoch": 0.1525010978482175, "grad_norm": 0.5523782556900674, "learning_rate": 9.653483302662452e-05, "loss": 0.0942, "step": 1910 }, { "epoch": 0.15329953291548565, "grad_norm": 0.2828028287146034, "learning_rate": 9.660155823251565e-05, "loss": 0.068, "step": 1920 }, { "epoch": 0.1540979679827538, "grad_norm": 0.7231361664577421, "learning_rate": 9.666793681239751e-05, "loss": 0.0645, "step": 1930 }, { "epoch": 0.15489640305002195, "grad_norm": 0.7844028595228837, "learning_rate": 9.673397234899194e-05, "loss": 0.0749, "step": 1940 }, { "epoch": 0.15569483811729012, "grad_norm": 1.0478767531894126, "learning_rate": 9.679966836975974e-05, "loss": 0.1157, "step": 1950 }, { "epoch": 0.15649327318455827, "grad_norm": 0.41201473500664804, "learning_rate": 9.686502834803144e-05, "loss": 0.0799, "step": 1960 }, { "epoch": 0.15729170825182642, "grad_norm": 0.47662392382424124, "learning_rate": 9.69300557041092e-05, "loss": 0.0657, "step": 1970 }, { "epoch": 0.15809014331909457, "grad_norm": 0.22078628822978255, "learning_rate": 9.699475380634067e-05, "loss": 0.057, "step": 1980 }, { "epoch": 0.15888857838636272, "grad_norm": 0.5959337410821844, "learning_rate": 9.705912597216614e-05, "loss": 0.0651, "step": 1990 }, { "epoch": 0.15968701345363087, "grad_norm": 0.7035564380493872, "learning_rate": 9.712317546913912e-05, "loss": 0.0659, "step": 2000 }, { "epoch": 0.16048544852089905, "grad_norm": 0.38778113345150145, "learning_rate": 9.718690551592169e-05, "loss": 0.0927, "step": 2010 }, { "epoch": 0.1612838835881672, "grad_norm": 0.3486784675066523, "learning_rate": 9.725031928325491e-05, "loss": 0.0716, "step": 2020 }, { "epoch": 0.16208231865543535, "grad_norm": 0.6303310068494262, "learning_rate": 9.731341989490561e-05, "loss": 0.078, "step": 2030 }, { "epoch": 0.1628807537227035, "grad_norm": 1.4877039883169711, "learning_rate": 9.737621042858955e-05, "loss": 0.0866, "step": 2040 }, { "epoch": 0.16367918878997165, "grad_norm": 0.7954321278315749, "learning_rate": 9.743869391687233e-05, "loss": 0.0798, "step": 2050 }, { "epoch": 0.1644776238572398, "grad_norm": 0.28805128141498443, "learning_rate": 9.750087334804812e-05, "loss": 0.0702, "step": 2060 }, { "epoch": 0.16527605892450797, "grad_norm": 0.6762487117705422, "learning_rate": 9.75627516669973e-05, "loss": 0.0602, "step": 2070 }, { "epoch": 0.16607449399177612, "grad_norm": 0.5536331829158825, "learning_rate": 9.76243317760233e-05, "loss": 0.0459, "step": 2080 }, { "epoch": 0.16687292905904427, "grad_norm": 0.33803981510789505, "learning_rate": 9.768561653566953e-05, "loss": 0.0667, "step": 2090 }, { "epoch": 0.16767136412631242, "grad_norm": 0.19271271530476722, "learning_rate": 9.774660876551659e-05, "loss": 0.0614, "step": 2100 }, { "epoch": 0.16846979919358057, "grad_norm": 0.8927328638984207, "learning_rate": 9.780731124496076e-05, "loss": 0.0767, "step": 2110 }, { "epoch": 0.16926823426084875, "grad_norm": 0.5298297843276258, "learning_rate": 9.786772671397395e-05, "loss": 0.0629, "step": 2120 }, { "epoch": 0.1700666693281169, "grad_norm": 0.30875418229193075, "learning_rate": 9.792785787384581e-05, "loss": 0.0334, "step": 2130 }, { "epoch": 0.17086510439538505, "grad_norm": 0.4603898414224521, "learning_rate": 9.798770738790855e-05, "loss": 0.0668, "step": 2140 }, { "epoch": 0.1716635394626532, "grad_norm": 0.17333943418800754, "learning_rate": 9.804727788224462e-05, "loss": 0.0511, "step": 2150 }, { "epoch": 0.17246197452992135, "grad_norm": 0.7779994729637698, "learning_rate": 9.810657194637825e-05, "loss": 0.0535, "step": 2160 }, { "epoch": 0.1732604095971895, "grad_norm": 0.32995926154524996, "learning_rate": 9.816559213395087e-05, "loss": 0.0908, "step": 2170 }, { "epoch": 0.17405884466445767, "grad_norm": 1.453741519995499, "learning_rate": 9.8224340963381e-05, "loss": 0.0908, "step": 2180 }, { "epoch": 0.17485727973172582, "grad_norm": 1.8378453173299512, "learning_rate": 9.82828209185091e-05, "loss": 0.1127, "step": 2190 }, { "epoch": 0.17565571479899397, "grad_norm": 0.48725273756729015, "learning_rate": 9.83410344492277e-05, "loss": 0.0645, "step": 2200 }, { "epoch": 0.17645414986626212, "grad_norm": 0.22593484396831762, "learning_rate": 9.83989839720972e-05, "loss": 0.0517, "step": 2210 }, { "epoch": 0.17725258493353027, "grad_norm": 0.33118665534159303, "learning_rate": 9.845667187094785e-05, "loss": 0.0536, "step": 2220 }, { "epoch": 0.17805102000079845, "grad_norm": 0.39066819449354656, "learning_rate": 9.851410049746818e-05, "loss": 0.0777, "step": 2230 }, { "epoch": 0.1788494550680666, "grad_norm": 0.8485083305429001, "learning_rate": 9.857127217178015e-05, "loss": 0.0697, "step": 2240 }, { "epoch": 0.17964789013533475, "grad_norm": 0.19952539466987482, "learning_rate": 9.862818918300173e-05, "loss": 0.0451, "step": 2250 }, { "epoch": 0.1804463252026029, "grad_norm": 0.691252616561084, "learning_rate": 9.868485378979675e-05, "loss": 0.0647, "step": 2260 }, { "epoch": 0.18124476026987105, "grad_norm": 0.7049940159011623, "learning_rate": 9.874126822091277e-05, "loss": 0.0845, "step": 2270 }, { "epoch": 0.1820431953371392, "grad_norm": 1.2133382620511355, "learning_rate": 9.879743467570711e-05, "loss": 0.0588, "step": 2280 }, { "epoch": 0.18284163040440737, "grad_norm": 0.35602520819498223, "learning_rate": 9.885335532466129e-05, "loss": 0.0742, "step": 2290 }, { "epoch": 0.18364006547167552, "grad_norm": 0.4158085994828218, "learning_rate": 9.890903230988434e-05, "loss": 0.0808, "step": 2300 }, { "epoch": 0.18443850053894367, "grad_norm": 0.18046190200012216, "learning_rate": 9.896446774560516e-05, "loss": 0.0503, "step": 2310 }, { "epoch": 0.18523693560621182, "grad_norm": 0.22406543693610548, "learning_rate": 9.90196637186543e-05, "loss": 0.0657, "step": 2320 }, { "epoch": 0.18603537067347997, "grad_norm": 0.4288290023830769, "learning_rate": 9.907462228893529e-05, "loss": 0.0552, "step": 2330 }, { "epoch": 0.18683380574074812, "grad_norm": 0.385521256661968, "learning_rate": 9.912934548988589e-05, "loss": 0.0399, "step": 2340 }, { "epoch": 0.1876322408080163, "grad_norm": 0.7416803203530228, "learning_rate": 9.918383532892963e-05, "loss": 0.0556, "step": 2350 }, { "epoch": 0.18843067587528445, "grad_norm": 0.40622202711719285, "learning_rate": 9.923809378791754e-05, "loss": 0.0685, "step": 2360 }, { "epoch": 0.1892291109425526, "grad_norm": 0.8021095669829111, "learning_rate": 9.92921228235608e-05, "loss": 0.0448, "step": 2370 }, { "epoch": 0.19002754600982075, "grad_norm": 0.2868854139454059, "learning_rate": 9.934592436785405e-05, "loss": 0.0476, "step": 2380 }, { "epoch": 0.1908259810770889, "grad_norm": 1.2605357558943955, "learning_rate": 9.939950032849001e-05, "loss": 0.051, "step": 2390 }, { "epoch": 0.19162441614435707, "grad_norm": 1.073063002530671, "learning_rate": 9.94528525892653e-05, "loss": 0.0759, "step": 2400 }, { "epoch": 0.19242285121162522, "grad_norm": 0.1347432671200069, "learning_rate": 9.950598301047787e-05, "loss": 0.0657, "step": 2410 }, { "epoch": 0.19322128627889337, "grad_norm": 0.48182010739489867, "learning_rate": 9.955889342931627e-05, "loss": 0.0589, "step": 2420 }, { "epoch": 0.19401972134616152, "grad_norm": 0.6296388960353804, "learning_rate": 9.961158566024085e-05, "loss": 0.0714, "step": 2430 }, { "epoch": 0.19481815641342967, "grad_norm": 0.33729264945412785, "learning_rate": 9.966406149535702e-05, "loss": 0.0479, "step": 2440 }, { "epoch": 0.19561659148069782, "grad_norm": 0.1498735604114689, "learning_rate": 9.97163227047811e-05, "loss": 0.0498, "step": 2450 }, { "epoch": 0.196415026547966, "grad_norm": 1.0199520439535377, "learning_rate": 9.976837103699849e-05, "loss": 0.1106, "step": 2460 }, { "epoch": 0.19721346161523415, "grad_norm": 0.29763329077307044, "learning_rate": 9.982020821921477e-05, "loss": 0.0692, "step": 2470 }, { "epoch": 0.1980118966825023, "grad_norm": 1.047960512760466, "learning_rate": 9.987183595769957e-05, "loss": 0.0876, "step": 2480 }, { "epoch": 0.19881033174977045, "grad_norm": 1.7136728132679582, "learning_rate": 9.992325593812358e-05, "loss": 0.1053, "step": 2490 }, { "epoch": 0.1996087668170386, "grad_norm": 0.2024671443429759, "learning_rate": 9.997446982588877e-05, "loss": 0.0713, "step": 2500 }, { "epoch": 0.20040720188430675, "grad_norm": 0.8635214144009746, "learning_rate": 9.998225613272413e-05, "loss": 0.0753, "step": 2510 }, { "epoch": 0.20120563695157492, "grad_norm": 1.1726579945772015, "learning_rate": 9.993789646453446e-05, "loss": 0.0464, "step": 2520 }, { "epoch": 0.20200407201884307, "grad_norm": 0.5156462324013295, "learning_rate": 9.989353679634476e-05, "loss": 0.0641, "step": 2530 }, { "epoch": 0.20280250708611122, "grad_norm": 0.4818857730271147, "learning_rate": 9.984917712815509e-05, "loss": 0.0774, "step": 2540 }, { "epoch": 0.20360094215337937, "grad_norm": 0.5932059075223263, "learning_rate": 9.980481745996541e-05, "loss": 0.0765, "step": 2550 }, { "epoch": 0.20439937722064752, "grad_norm": 0.5806911480243303, "learning_rate": 9.976045779177571e-05, "loss": 0.0792, "step": 2560 }, { "epoch": 0.2051978122879157, "grad_norm": 0.767705201792789, "learning_rate": 9.971609812358604e-05, "loss": 0.0666, "step": 2570 }, { "epoch": 0.20599624735518385, "grad_norm": 0.2993239339384049, "learning_rate": 9.967173845539636e-05, "loss": 0.0739, "step": 2580 }, { "epoch": 0.206794682422452, "grad_norm": 0.347192548507271, "learning_rate": 9.962737878720667e-05, "loss": 0.0832, "step": 2590 }, { "epoch": 0.20759311748972015, "grad_norm": 0.4651167628767381, "learning_rate": 9.9583019119017e-05, "loss": 0.0813, "step": 2600 }, { "epoch": 0.2083915525569883, "grad_norm": 0.5106958424622942, "learning_rate": 9.953865945082731e-05, "loss": 0.0887, "step": 2610 }, { "epoch": 0.20918998762425645, "grad_norm": 0.35739379245303143, "learning_rate": 9.949429978263763e-05, "loss": 0.0644, "step": 2620 }, { "epoch": 0.20998842269152462, "grad_norm": 1.0125482686281093, "learning_rate": 9.944994011444795e-05, "loss": 0.06, "step": 2630 }, { "epoch": 0.21078685775879277, "grad_norm": 0.3451023665574518, "learning_rate": 9.940558044625827e-05, "loss": 0.0867, "step": 2640 }, { "epoch": 0.21158529282606092, "grad_norm": 0.6950882876363752, "learning_rate": 9.936122077806858e-05, "loss": 0.0692, "step": 2650 }, { "epoch": 0.21238372789332907, "grad_norm": 0.6273149132955865, "learning_rate": 9.93168611098789e-05, "loss": 0.0653, "step": 2660 }, { "epoch": 0.21318216296059722, "grad_norm": 0.8797100572687333, "learning_rate": 9.927250144168923e-05, "loss": 0.0862, "step": 2670 }, { "epoch": 0.21398059802786537, "grad_norm": 0.3022352149028631, "learning_rate": 9.922814177349954e-05, "loss": 0.0388, "step": 2680 }, { "epoch": 0.21477903309513355, "grad_norm": 0.46577150224023545, "learning_rate": 9.918378210530985e-05, "loss": 0.0907, "step": 2690 }, { "epoch": 0.2155774681624017, "grad_norm": 0.40185970587390235, "learning_rate": 9.913942243712018e-05, "loss": 0.0664, "step": 2700 }, { "epoch": 0.21637590322966985, "grad_norm": 0.7032122014906631, "learning_rate": 9.909506276893049e-05, "loss": 0.0556, "step": 2710 }, { "epoch": 0.217174338296938, "grad_norm": 0.20054634591660722, "learning_rate": 9.905070310074082e-05, "loss": 0.053, "step": 2720 }, { "epoch": 0.21797277336420615, "grad_norm": 0.8214860525138075, "learning_rate": 9.900634343255114e-05, "loss": 0.0629, "step": 2730 }, { "epoch": 0.21877120843147432, "grad_norm": 0.4407457602483392, "learning_rate": 9.896198376436144e-05, "loss": 0.0509, "step": 2740 }, { "epoch": 0.21956964349874247, "grad_norm": 0.9525928650688722, "learning_rate": 9.891762409617177e-05, "loss": 0.0751, "step": 2750 }, { "epoch": 0.22036807856601062, "grad_norm": 0.7920710965808775, "learning_rate": 9.887326442798209e-05, "loss": 0.0753, "step": 2760 }, { "epoch": 0.22116651363327877, "grad_norm": 0.6674506450378365, "learning_rate": 9.88289047597924e-05, "loss": 0.0559, "step": 2770 }, { "epoch": 0.22196494870054692, "grad_norm": 0.4032602018987511, "learning_rate": 9.878454509160272e-05, "loss": 0.0498, "step": 2780 }, { "epoch": 0.22276338376781507, "grad_norm": 0.34518737677514616, "learning_rate": 9.874018542341304e-05, "loss": 0.0729, "step": 2790 }, { "epoch": 0.22356181883508325, "grad_norm": 0.26706028708878127, "learning_rate": 9.869582575522336e-05, "loss": 0.0691, "step": 2800 }, { "epoch": 0.2243602539023514, "grad_norm": 0.43474114414320436, "learning_rate": 9.865146608703367e-05, "loss": 0.0615, "step": 2810 }, { "epoch": 0.22515868896961955, "grad_norm": 0.5283867052692955, "learning_rate": 9.860710641884399e-05, "loss": 0.071, "step": 2820 }, { "epoch": 0.2259571240368877, "grad_norm": 0.47837272161979605, "learning_rate": 9.856274675065431e-05, "loss": 0.0693, "step": 2830 }, { "epoch": 0.22675555910415585, "grad_norm": 0.7024624015245926, "learning_rate": 9.851838708246463e-05, "loss": 0.0738, "step": 2840 }, { "epoch": 0.227553994171424, "grad_norm": 0.3111551408716249, "learning_rate": 9.847402741427494e-05, "loss": 0.0476, "step": 2850 }, { "epoch": 0.22835242923869217, "grad_norm": 0.2394474908901574, "learning_rate": 9.842966774608526e-05, "loss": 0.0574, "step": 2860 }, { "epoch": 0.22915086430596032, "grad_norm": 1.1725937115127267, "learning_rate": 9.838530807789558e-05, "loss": 0.0918, "step": 2870 }, { "epoch": 0.22994929937322847, "grad_norm": 0.3956678467142246, "learning_rate": 9.83409484097059e-05, "loss": 0.0844, "step": 2880 }, { "epoch": 0.23074773444049662, "grad_norm": 0.5616900783159234, "learning_rate": 9.829658874151621e-05, "loss": 0.0773, "step": 2890 }, { "epoch": 0.23154616950776477, "grad_norm": 0.4062469324519191, "learning_rate": 9.825222907332654e-05, "loss": 0.0413, "step": 2900 }, { "epoch": 0.23234460457503295, "grad_norm": 0.8819733220912126, "learning_rate": 9.820786940513685e-05, "loss": 0.0735, "step": 2910 }, { "epoch": 0.2331430396423011, "grad_norm": 1.006357506188907, "learning_rate": 9.816350973694717e-05, "loss": 0.0786, "step": 2920 }, { "epoch": 0.23394147470956925, "grad_norm": 0.2040002764107568, "learning_rate": 9.81191500687575e-05, "loss": 0.0543, "step": 2930 }, { "epoch": 0.2347399097768374, "grad_norm": 1.4210094000753832, "learning_rate": 9.80747904005678e-05, "loss": 0.064, "step": 2940 }, { "epoch": 0.23553834484410555, "grad_norm": 0.5962753420340007, "learning_rate": 9.803043073237813e-05, "loss": 0.0615, "step": 2950 }, { "epoch": 0.2363367799113737, "grad_norm": 0.5173259768588669, "learning_rate": 9.798607106418845e-05, "loss": 0.0762, "step": 2960 }, { "epoch": 0.23713521497864187, "grad_norm": 1.2344255844869074, "learning_rate": 9.794171139599875e-05, "loss": 0.0823, "step": 2970 }, { "epoch": 0.23793365004591002, "grad_norm": 0.7619318342778804, "learning_rate": 9.789735172780908e-05, "loss": 0.0782, "step": 2980 }, { "epoch": 0.23873208511317817, "grad_norm": 0.6206950997276741, "learning_rate": 9.78529920596194e-05, "loss": 0.0664, "step": 2990 }, { "epoch": 0.23953052018044632, "grad_norm": 0.2777974840701721, "learning_rate": 9.780863239142972e-05, "loss": 0.0639, "step": 3000 }, { "epoch": 0.24032895524771447, "grad_norm": 1.2924041445825014, "learning_rate": 9.776427272324004e-05, "loss": 0.0668, "step": 3010 }, { "epoch": 0.24112739031498262, "grad_norm": 0.5118342520849409, "learning_rate": 9.771991305505035e-05, "loss": 0.0797, "step": 3020 }, { "epoch": 0.2419258253822508, "grad_norm": 0.8633704661668934, "learning_rate": 9.767555338686067e-05, "loss": 0.0797, "step": 3030 }, { "epoch": 0.24272426044951895, "grad_norm": 0.25434876668468953, "learning_rate": 9.763119371867099e-05, "loss": 0.0679, "step": 3040 }, { "epoch": 0.2435226955167871, "grad_norm": 1.0640468101763765, "learning_rate": 9.75868340504813e-05, "loss": 0.0583, "step": 3050 }, { "epoch": 0.24432113058405525, "grad_norm": 0.5614750666753393, "learning_rate": 9.754247438229162e-05, "loss": 0.079, "step": 3060 }, { "epoch": 0.2451195656513234, "grad_norm": 0.6715198464966025, "learning_rate": 9.749811471410194e-05, "loss": 0.0683, "step": 3070 }, { "epoch": 0.24591800071859157, "grad_norm": 1.291191254475961, "learning_rate": 9.745375504591227e-05, "loss": 0.0821, "step": 3080 }, { "epoch": 0.24671643578585972, "grad_norm": 0.49187729587768836, "learning_rate": 9.740939537772257e-05, "loss": 0.064, "step": 3090 }, { "epoch": 0.24751487085312787, "grad_norm": 0.20031272028441496, "learning_rate": 9.736503570953289e-05, "loss": 0.0518, "step": 3100 }, { "epoch": 0.24831330592039602, "grad_norm": 0.7593839309467876, "learning_rate": 9.732067604134322e-05, "loss": 0.085, "step": 3110 }, { "epoch": 0.24911174098766417, "grad_norm": 1.5421303335254006, "learning_rate": 9.727631637315353e-05, "loss": 0.0951, "step": 3120 }, { "epoch": 0.24991017605493232, "grad_norm": 0.6102084147035887, "learning_rate": 9.723195670496386e-05, "loss": 0.0655, "step": 3130 }, { "epoch": 0.2507086111222005, "grad_norm": 0.44309051140710953, "learning_rate": 9.718759703677418e-05, "loss": 0.0692, "step": 3140 }, { "epoch": 0.2515070461894686, "grad_norm": 0.2800764130796373, "learning_rate": 9.714323736858448e-05, "loss": 0.0566, "step": 3150 }, { "epoch": 0.2523054812567368, "grad_norm": 0.33225924131316076, "learning_rate": 9.709887770039481e-05, "loss": 0.0555, "step": 3160 }, { "epoch": 0.253103916324005, "grad_norm": 0.28000037239354675, "learning_rate": 9.705451803220513e-05, "loss": 0.0549, "step": 3170 }, { "epoch": 0.2539023513912731, "grad_norm": 0.5510944129584439, "learning_rate": 9.701015836401544e-05, "loss": 0.0699, "step": 3180 }, { "epoch": 0.2547007864585413, "grad_norm": 0.32418363687489393, "learning_rate": 9.696579869582576e-05, "loss": 0.0686, "step": 3190 }, { "epoch": 0.2554992215258094, "grad_norm": 0.6754324134221963, "learning_rate": 9.692143902763608e-05, "loss": 0.071, "step": 3200 }, { "epoch": 0.25629765659307757, "grad_norm": 1.022738360263931, "learning_rate": 9.68770793594464e-05, "loss": 0.0691, "step": 3210 }, { "epoch": 0.25709609166034575, "grad_norm": 0.8622983849669558, "learning_rate": 9.683271969125671e-05, "loss": 0.093, "step": 3220 }, { "epoch": 0.25789452672761387, "grad_norm": 0.5050331506487474, "learning_rate": 9.678836002306703e-05, "loss": 0.0631, "step": 3230 }, { "epoch": 0.25869296179488205, "grad_norm": 1.0255097116123932, "learning_rate": 9.674400035487735e-05, "loss": 0.0563, "step": 3240 }, { "epoch": 0.25949139686215017, "grad_norm": 0.4240474303571155, "learning_rate": 9.669964068668767e-05, "loss": 0.0746, "step": 3250 }, { "epoch": 0.26028983192941835, "grad_norm": 0.17675298277758164, "learning_rate": 9.665528101849798e-05, "loss": 0.0673, "step": 3260 }, { "epoch": 0.26108826699668647, "grad_norm": 0.3869745350922928, "learning_rate": 9.66109213503083e-05, "loss": 0.1101, "step": 3270 }, { "epoch": 0.26188670206395465, "grad_norm": 0.15099346361631436, "learning_rate": 9.656656168211862e-05, "loss": 0.0728, "step": 3280 }, { "epoch": 0.2626851371312228, "grad_norm": 0.7320323894609118, "learning_rate": 9.652220201392894e-05, "loss": 0.0712, "step": 3290 }, { "epoch": 0.26348357219849095, "grad_norm": 0.13397678878907418, "learning_rate": 9.647784234573925e-05, "loss": 0.0557, "step": 3300 }, { "epoch": 0.2642820072657591, "grad_norm": 0.4705351472047174, "learning_rate": 9.643348267754958e-05, "loss": 0.0599, "step": 3310 }, { "epoch": 0.26508044233302724, "grad_norm": 1.24551689645886, "learning_rate": 9.638912300935989e-05, "loss": 0.0589, "step": 3320 }, { "epoch": 0.2658788774002954, "grad_norm": 0.24989018849005057, "learning_rate": 9.63447633411702e-05, "loss": 0.0731, "step": 3330 }, { "epoch": 0.2666773124675636, "grad_norm": 0.4633263939700149, "learning_rate": 9.630040367298054e-05, "loss": 0.0525, "step": 3340 }, { "epoch": 0.2674757475348317, "grad_norm": 0.5755597975991337, "learning_rate": 9.625604400479084e-05, "loss": 0.0742, "step": 3350 }, { "epoch": 0.2682741826020999, "grad_norm": 0.3891342346275618, "learning_rate": 9.621168433660117e-05, "loss": 0.059, "step": 3360 }, { "epoch": 0.269072617669368, "grad_norm": 0.13349742033851617, "learning_rate": 9.616732466841149e-05, "loss": 0.0548, "step": 3370 }, { "epoch": 0.2698710527366362, "grad_norm": 0.48026841764167816, "learning_rate": 9.612296500022179e-05, "loss": 0.0668, "step": 3380 }, { "epoch": 0.2706694878039044, "grad_norm": 0.7808325364960803, "learning_rate": 9.607860533203212e-05, "loss": 0.0704, "step": 3390 }, { "epoch": 0.2714679228711725, "grad_norm": 0.4864712450283722, "learning_rate": 9.603424566384244e-05, "loss": 0.0713, "step": 3400 }, { "epoch": 0.2722663579384407, "grad_norm": 0.887232498565028, "learning_rate": 9.598988599565276e-05, "loss": 0.0525, "step": 3410 }, { "epoch": 0.2730647930057088, "grad_norm": 0.8234101548616632, "learning_rate": 9.594552632746308e-05, "loss": 0.0733, "step": 3420 }, { "epoch": 0.27386322807297697, "grad_norm": 0.5022871225844846, "learning_rate": 9.590116665927339e-05, "loss": 0.0547, "step": 3430 }, { "epoch": 0.2746616631402451, "grad_norm": 2.1522421476954547, "learning_rate": 9.585680699108371e-05, "loss": 0.0668, "step": 3440 }, { "epoch": 0.27546009820751327, "grad_norm": 1.3741601077902381, "learning_rate": 9.581244732289403e-05, "loss": 0.0658, "step": 3450 }, { "epoch": 0.27625853327478145, "grad_norm": 0.6094718464750811, "learning_rate": 9.576808765470436e-05, "loss": 0.0662, "step": 3460 }, { "epoch": 0.27705696834204957, "grad_norm": 0.3305485987290091, "learning_rate": 9.572372798651466e-05, "loss": 0.0578, "step": 3470 }, { "epoch": 0.27785540340931775, "grad_norm": 0.6526066084303576, "learning_rate": 9.567936831832498e-05, "loss": 0.0746, "step": 3480 }, { "epoch": 0.27865383847658587, "grad_norm": 0.9842490963836676, "learning_rate": 9.563500865013531e-05, "loss": 0.0631, "step": 3490 }, { "epoch": 0.27945227354385405, "grad_norm": 0.8835467687651831, "learning_rate": 9.559064898194561e-05, "loss": 0.0875, "step": 3500 }, { "epoch": 0.2802507086111222, "grad_norm": 0.781469027598919, "learning_rate": 9.554628931375593e-05, "loss": 0.101, "step": 3510 }, { "epoch": 0.28104914367839035, "grad_norm": 0.8106723090645768, "learning_rate": 9.550192964556626e-05, "loss": 0.0668, "step": 3520 }, { "epoch": 0.2818475787456585, "grad_norm": 0.46004322835035094, "learning_rate": 9.545756997737657e-05, "loss": 0.0694, "step": 3530 }, { "epoch": 0.28264601381292664, "grad_norm": 0.5784098668603113, "learning_rate": 9.54132103091869e-05, "loss": 0.0688, "step": 3540 }, { "epoch": 0.2834444488801948, "grad_norm": 0.6629513380474472, "learning_rate": 9.536885064099722e-05, "loss": 0.0608, "step": 3550 }, { "epoch": 0.284242883947463, "grad_norm": 0.19931079062599605, "learning_rate": 9.532449097280752e-05, "loss": 0.0841, "step": 3560 }, { "epoch": 0.2850413190147311, "grad_norm": 0.7951033718666626, "learning_rate": 9.528013130461785e-05, "loss": 0.0527, "step": 3570 }, { "epoch": 0.2858397540819993, "grad_norm": 0.9565484508249654, "learning_rate": 9.523577163642817e-05, "loss": 0.0482, "step": 3580 }, { "epoch": 0.2866381891492674, "grad_norm": 0.8392384050951297, "learning_rate": 9.519141196823848e-05, "loss": 0.0633, "step": 3590 }, { "epoch": 0.2874366242165356, "grad_norm": 0.5717780924727565, "learning_rate": 9.51470523000488e-05, "loss": 0.0449, "step": 3600 }, { "epoch": 0.2882350592838037, "grad_norm": 0.4189642837284483, "learning_rate": 9.510269263185912e-05, "loss": 0.0488, "step": 3610 }, { "epoch": 0.2890334943510719, "grad_norm": 0.2488194601811155, "learning_rate": 9.505833296366944e-05, "loss": 0.0654, "step": 3620 }, { "epoch": 0.2898319294183401, "grad_norm": 0.4890100046395099, "learning_rate": 9.501397329547975e-05, "loss": 0.0531, "step": 3630 }, { "epoch": 0.2906303644856082, "grad_norm": 0.6529603939573656, "learning_rate": 9.496961362729007e-05, "loss": 0.0761, "step": 3640 }, { "epoch": 0.29142879955287637, "grad_norm": 0.1019537391373853, "learning_rate": 9.492525395910039e-05, "loss": 0.0467, "step": 3650 }, { "epoch": 0.2922272346201445, "grad_norm": 0.17618204759123188, "learning_rate": 9.48808942909107e-05, "loss": 0.0554, "step": 3660 }, { "epoch": 0.29302566968741267, "grad_norm": 0.5353417707615883, "learning_rate": 9.483653462272102e-05, "loss": 0.0586, "step": 3670 }, { "epoch": 0.29382410475468085, "grad_norm": 0.5391605538942996, "learning_rate": 9.479217495453134e-05, "loss": 0.057, "step": 3680 }, { "epoch": 0.29462253982194897, "grad_norm": 0.19771138219286866, "learning_rate": 9.474781528634167e-05, "loss": 0.0715, "step": 3690 }, { "epoch": 0.29542097488921715, "grad_norm": 1.491295495505135, "learning_rate": 9.470345561815198e-05, "loss": 0.0708, "step": 3700 }, { "epoch": 0.29621940995648527, "grad_norm": 0.13512836092147384, "learning_rate": 9.465909594996229e-05, "loss": 0.0581, "step": 3710 }, { "epoch": 0.29701784502375345, "grad_norm": 1.5095520257451078, "learning_rate": 9.461473628177262e-05, "loss": 0.0559, "step": 3720 }, { "epoch": 0.2978162800910216, "grad_norm": 1.1545077382657083, "learning_rate": 9.457037661358293e-05, "loss": 0.0673, "step": 3730 }, { "epoch": 0.29861471515828975, "grad_norm": 1.3565052045468842, "learning_rate": 9.452601694539325e-05, "loss": 0.0759, "step": 3740 }, { "epoch": 0.2994131502255579, "grad_norm": 0.2761557893990645, "learning_rate": 9.448165727720358e-05, "loss": 0.0729, "step": 3750 }, { "epoch": 0.30021158529282604, "grad_norm": 0.6113839039760615, "learning_rate": 9.443729760901388e-05, "loss": 0.0804, "step": 3760 }, { "epoch": 0.3010100203600942, "grad_norm": 1.1193210205165771, "learning_rate": 9.439293794082421e-05, "loss": 0.0607, "step": 3770 }, { "epoch": 0.30180845542736234, "grad_norm": 1.2227261887092338, "learning_rate": 9.434857827263453e-05, "loss": 0.0887, "step": 3780 }, { "epoch": 0.3026068904946305, "grad_norm": 0.6336848150580261, "learning_rate": 9.430421860444483e-05, "loss": 0.0894, "step": 3790 }, { "epoch": 0.3034053255618987, "grad_norm": 0.3976035265311596, "learning_rate": 9.425985893625516e-05, "loss": 0.0757, "step": 3800 }, { "epoch": 0.3042037606291668, "grad_norm": 0.3575402571045892, "learning_rate": 9.421549926806548e-05, "loss": 0.0803, "step": 3810 }, { "epoch": 0.305002195696435, "grad_norm": 0.5648417782171387, "learning_rate": 9.41711395998758e-05, "loss": 0.0617, "step": 3820 }, { "epoch": 0.3058006307637031, "grad_norm": 0.2822352199969617, "learning_rate": 9.412677993168612e-05, "loss": 0.0802, "step": 3830 }, { "epoch": 0.3065990658309713, "grad_norm": 0.13975805465865077, "learning_rate": 9.408242026349643e-05, "loss": 0.0652, "step": 3840 }, { "epoch": 0.3073975008982395, "grad_norm": 0.7969925852775963, "learning_rate": 9.403806059530675e-05, "loss": 0.0369, "step": 3850 }, { "epoch": 0.3081959359655076, "grad_norm": 0.35201376066581236, "learning_rate": 9.399370092711707e-05, "loss": 0.0478, "step": 3860 }, { "epoch": 0.30899437103277577, "grad_norm": 0.6213359726373199, "learning_rate": 9.39493412589274e-05, "loss": 0.0793, "step": 3870 }, { "epoch": 0.3097928061000439, "grad_norm": 0.19355341743297388, "learning_rate": 9.39049815907377e-05, "loss": 0.0663, "step": 3880 }, { "epoch": 0.31059124116731207, "grad_norm": 0.1777608148662464, "learning_rate": 9.386062192254802e-05, "loss": 0.066, "step": 3890 }, { "epoch": 0.31138967623458025, "grad_norm": 0.3915213687567011, "learning_rate": 9.381626225435835e-05, "loss": 0.0462, "step": 3900 }, { "epoch": 0.31218811130184837, "grad_norm": 0.45632356085935716, "learning_rate": 9.377190258616865e-05, "loss": 0.0543, "step": 3910 }, { "epoch": 0.31298654636911655, "grad_norm": 0.14412321773869619, "learning_rate": 9.372754291797899e-05, "loss": 0.0574, "step": 3920 }, { "epoch": 0.31378498143638467, "grad_norm": 0.2525773726678694, "learning_rate": 9.36831832497893e-05, "loss": 0.0737, "step": 3930 }, { "epoch": 0.31458341650365285, "grad_norm": 0.8403006704325723, "learning_rate": 9.36388235815996e-05, "loss": 0.0939, "step": 3940 }, { "epoch": 0.31538185157092097, "grad_norm": 0.6521623803536748, "learning_rate": 9.359446391340994e-05, "loss": 0.058, "step": 3950 }, { "epoch": 0.31618028663818915, "grad_norm": 0.3547147871976421, "learning_rate": 9.355010424522025e-05, "loss": 0.0631, "step": 3960 }, { "epoch": 0.3169787217054573, "grad_norm": 1.213057907328134, "learning_rate": 9.350574457703056e-05, "loss": 0.0709, "step": 3970 }, { "epoch": 0.31777715677272544, "grad_norm": 0.8145837560797573, "learning_rate": 9.346138490884089e-05, "loss": 0.0789, "step": 3980 }, { "epoch": 0.3185755918399936, "grad_norm": 0.31619564069314254, "learning_rate": 9.341702524065121e-05, "loss": 0.0592, "step": 3990 }, { "epoch": 0.31937402690726174, "grad_norm": 0.1999948652995334, "learning_rate": 9.337266557246152e-05, "loss": 0.0611, "step": 4000 }, { "epoch": 0.3201724619745299, "grad_norm": 0.9048703932371391, "learning_rate": 9.332830590427184e-05, "loss": 0.0626, "step": 4010 }, { "epoch": 0.3209708970417981, "grad_norm": 0.581399417609177, "learning_rate": 9.328394623608216e-05, "loss": 0.0752, "step": 4020 }, { "epoch": 0.3217693321090662, "grad_norm": 0.3120397134500295, "learning_rate": 9.323958656789248e-05, "loss": 0.0559, "step": 4030 }, { "epoch": 0.3225677671763344, "grad_norm": 0.40722820540078686, "learning_rate": 9.31952268997028e-05, "loss": 0.0667, "step": 4040 }, { "epoch": 0.3233662022436025, "grad_norm": 0.49952846455914823, "learning_rate": 9.315086723151311e-05, "loss": 0.0699, "step": 4050 }, { "epoch": 0.3241646373108707, "grad_norm": 0.5499884570539099, "learning_rate": 9.310650756332343e-05, "loss": 0.0987, "step": 4060 }, { "epoch": 0.3249630723781389, "grad_norm": 0.705022079278504, "learning_rate": 9.306214789513375e-05, "loss": 0.0708, "step": 4070 }, { "epoch": 0.325761507445407, "grad_norm": 0.4639845211871033, "learning_rate": 9.301778822694406e-05, "loss": 0.0619, "step": 4080 }, { "epoch": 0.32655994251267517, "grad_norm": 0.20800711837138103, "learning_rate": 9.297342855875438e-05, "loss": 0.0428, "step": 4090 }, { "epoch": 0.3273583775799433, "grad_norm": 0.7247882394307614, "learning_rate": 9.292906889056471e-05, "loss": 0.0664, "step": 4100 }, { "epoch": 0.32815681264721147, "grad_norm": 0.7582305969619678, "learning_rate": 9.288470922237502e-05, "loss": 0.0482, "step": 4110 }, { "epoch": 0.3289552477144796, "grad_norm": 0.3892556989008362, "learning_rate": 9.284034955418533e-05, "loss": 0.0709, "step": 4120 }, { "epoch": 0.32975368278174777, "grad_norm": 0.28872693985562875, "learning_rate": 9.279598988599566e-05, "loss": 0.0382, "step": 4130 }, { "epoch": 0.33055211784901595, "grad_norm": 0.3375027856146073, "learning_rate": 9.275163021780597e-05, "loss": 0.0604, "step": 4140 }, { "epoch": 0.33135055291628407, "grad_norm": 0.7959867830511765, "learning_rate": 9.27072705496163e-05, "loss": 0.058, "step": 4150 }, { "epoch": 0.33214898798355225, "grad_norm": 1.113370323153677, "learning_rate": 9.266291088142662e-05, "loss": 0.0657, "step": 4160 }, { "epoch": 0.33294742305082037, "grad_norm": 0.3894855633297203, "learning_rate": 9.261855121323692e-05, "loss": 0.0606, "step": 4170 }, { "epoch": 0.33374585811808855, "grad_norm": 1.6682181003177796, "learning_rate": 9.257419154504725e-05, "loss": 0.0575, "step": 4180 }, { "epoch": 0.3345442931853567, "grad_norm": 0.4147451777526778, "learning_rate": 9.252983187685757e-05, "loss": 0.0757, "step": 4190 }, { "epoch": 0.33534272825262484, "grad_norm": 1.4313508327315267, "learning_rate": 9.248547220866787e-05, "loss": 0.0816, "step": 4200 }, { "epoch": 0.336141163319893, "grad_norm": 0.8629267807236959, "learning_rate": 9.24411125404782e-05, "loss": 0.0658, "step": 4210 }, { "epoch": 0.33693959838716114, "grad_norm": 0.14128184424301535, "learning_rate": 9.239675287228852e-05, "loss": 0.0519, "step": 4220 }, { "epoch": 0.3377380334544293, "grad_norm": 0.4930129021270943, "learning_rate": 9.235239320409884e-05, "loss": 0.0599, "step": 4230 }, { "epoch": 0.3385364685216975, "grad_norm": 0.6045942919945482, "learning_rate": 9.230803353590915e-05, "loss": 0.078, "step": 4240 }, { "epoch": 0.3393349035889656, "grad_norm": 0.5299359565226784, "learning_rate": 9.226367386771947e-05, "loss": 0.07, "step": 4250 }, { "epoch": 0.3401333386562338, "grad_norm": 0.32803420167901814, "learning_rate": 9.221931419952979e-05, "loss": 0.0677, "step": 4260 }, { "epoch": 0.3409317737235019, "grad_norm": 0.5736143342256715, "learning_rate": 9.217495453134011e-05, "loss": 0.0813, "step": 4270 }, { "epoch": 0.3417302087907701, "grad_norm": 0.5006873224659751, "learning_rate": 9.213059486315044e-05, "loss": 0.0919, "step": 4280 }, { "epoch": 0.3425286438580382, "grad_norm": 0.7189402066951537, "learning_rate": 9.208623519496074e-05, "loss": 0.0664, "step": 4290 }, { "epoch": 0.3433270789253064, "grad_norm": 1.1042243525293143, "learning_rate": 9.204187552677106e-05, "loss": 0.0837, "step": 4300 }, { "epoch": 0.34412551399257457, "grad_norm": 1.3047043218797034, "learning_rate": 9.199751585858139e-05, "loss": 0.0508, "step": 4310 }, { "epoch": 0.3449239490598427, "grad_norm": 0.7076312109495163, "learning_rate": 9.19531561903917e-05, "loss": 0.0574, "step": 4320 }, { "epoch": 0.34572238412711087, "grad_norm": 0.20680949689309577, "learning_rate": 9.190879652220202e-05, "loss": 0.0525, "step": 4330 }, { "epoch": 0.346520819194379, "grad_norm": 0.3035802107484262, "learning_rate": 9.186443685401234e-05, "loss": 0.0657, "step": 4340 }, { "epoch": 0.34731925426164717, "grad_norm": 0.7911695899519386, "learning_rate": 9.182007718582265e-05, "loss": 0.0849, "step": 4350 }, { "epoch": 0.34811768932891535, "grad_norm": 0.5895064283440106, "learning_rate": 9.177571751763298e-05, "loss": 0.0761, "step": 4360 }, { "epoch": 0.34891612439618347, "grad_norm": 0.35415376481925265, "learning_rate": 9.17313578494433e-05, "loss": 0.0713, "step": 4370 }, { "epoch": 0.34971455946345165, "grad_norm": 0.5419823106587008, "learning_rate": 9.168699818125361e-05, "loss": 0.0669, "step": 4380 }, { "epoch": 0.35051299453071977, "grad_norm": 0.8830483510160156, "learning_rate": 9.164263851306393e-05, "loss": 0.0735, "step": 4390 }, { "epoch": 0.35131142959798795, "grad_norm": 1.049664144394219, "learning_rate": 9.159827884487425e-05, "loss": 0.0653, "step": 4400 }, { "epoch": 0.3521098646652561, "grad_norm": 0.23432670361744762, "learning_rate": 9.155391917668456e-05, "loss": 0.0365, "step": 4410 }, { "epoch": 0.35290829973252424, "grad_norm": 0.6699954512353271, "learning_rate": 9.150955950849488e-05, "loss": 0.0573, "step": 4420 }, { "epoch": 0.3537067347997924, "grad_norm": 0.40988692483619393, "learning_rate": 9.14651998403052e-05, "loss": 0.0598, "step": 4430 }, { "epoch": 0.35450516986706054, "grad_norm": 0.30987019795651377, "learning_rate": 9.142084017211552e-05, "loss": 0.0583, "step": 4440 }, { "epoch": 0.3553036049343287, "grad_norm": 1.5473769973813984, "learning_rate": 9.137648050392583e-05, "loss": 0.088, "step": 4450 }, { "epoch": 0.3561020400015969, "grad_norm": 1.0985488042237361, "learning_rate": 9.133212083573615e-05, "loss": 0.0808, "step": 4460 }, { "epoch": 0.356900475068865, "grad_norm": 0.47391395596589303, "learning_rate": 9.128776116754647e-05, "loss": 0.0598, "step": 4470 }, { "epoch": 0.3576989101361332, "grad_norm": 0.16818095830793228, "learning_rate": 9.124340149935679e-05, "loss": 0.0713, "step": 4480 }, { "epoch": 0.3584973452034013, "grad_norm": 0.6284876149603, "learning_rate": 9.11990418311671e-05, "loss": 0.0791, "step": 4490 }, { "epoch": 0.3592957802706695, "grad_norm": 1.1532137182508528, "learning_rate": 9.115468216297742e-05, "loss": 0.0788, "step": 4500 }, { "epoch": 0.3600942153379376, "grad_norm": 0.38319530479006925, "learning_rate": 9.111032249478775e-05, "loss": 0.0706, "step": 4510 }, { "epoch": 0.3608926504052058, "grad_norm": 0.5929092448145604, "learning_rate": 9.106596282659806e-05, "loss": 0.0541, "step": 4520 }, { "epoch": 0.36169108547247397, "grad_norm": 0.40753898344676914, "learning_rate": 9.102160315840837e-05, "loss": 0.0711, "step": 4530 }, { "epoch": 0.3624895205397421, "grad_norm": 0.9834740305156284, "learning_rate": 9.09772434902187e-05, "loss": 0.075, "step": 4540 }, { "epoch": 0.36328795560701027, "grad_norm": 0.525576434370597, "learning_rate": 9.093288382202901e-05, "loss": 0.0544, "step": 4550 }, { "epoch": 0.3640863906742784, "grad_norm": 0.865321793226859, "learning_rate": 9.088852415383934e-05, "loss": 0.0629, "step": 4560 }, { "epoch": 0.36488482574154657, "grad_norm": 0.6423714230291824, "learning_rate": 9.084416448564966e-05, "loss": 0.0642, "step": 4570 }, { "epoch": 0.36568326080881475, "grad_norm": 0.5137380114454171, "learning_rate": 9.079980481745996e-05, "loss": 0.084, "step": 4580 }, { "epoch": 0.36648169587608287, "grad_norm": 0.19927820633025783, "learning_rate": 9.075544514927029e-05, "loss": 0.0903, "step": 4590 }, { "epoch": 0.36728013094335105, "grad_norm": 1.058178203153698, "learning_rate": 9.071108548108061e-05, "loss": 0.0831, "step": 4600 }, { "epoch": 0.36807856601061917, "grad_norm": 1.249280563738633, "learning_rate": 9.066672581289093e-05, "loss": 0.068, "step": 4610 }, { "epoch": 0.36887700107788735, "grad_norm": 0.5807254981900888, "learning_rate": 9.062236614470124e-05, "loss": 0.0591, "step": 4620 }, { "epoch": 0.3696754361451555, "grad_norm": 0.5441909496135758, "learning_rate": 9.057800647651156e-05, "loss": 0.0581, "step": 4630 }, { "epoch": 0.37047387121242364, "grad_norm": 1.6412220260958208, "learning_rate": 9.053364680832188e-05, "loss": 0.0759, "step": 4640 }, { "epoch": 0.3712723062796918, "grad_norm": 0.7095159700544676, "learning_rate": 9.04892871401322e-05, "loss": 0.0473, "step": 4650 }, { "epoch": 0.37207074134695994, "grad_norm": 0.260058630317952, "learning_rate": 9.044492747194251e-05, "loss": 0.0461, "step": 4660 }, { "epoch": 0.3728691764142281, "grad_norm": 0.22182297181489585, "learning_rate": 9.040056780375283e-05, "loss": 0.061, "step": 4670 }, { "epoch": 0.37366761148149624, "grad_norm": 1.0224917304159058, "learning_rate": 9.035620813556315e-05, "loss": 0.0552, "step": 4680 }, { "epoch": 0.3744660465487644, "grad_norm": 0.28268273613408273, "learning_rate": 9.031184846737348e-05, "loss": 0.0496, "step": 4690 }, { "epoch": 0.3752644816160326, "grad_norm": 0.7027656925839558, "learning_rate": 9.026748879918378e-05, "loss": 0.0476, "step": 4700 }, { "epoch": 0.3760629166833007, "grad_norm": 0.3836861736397752, "learning_rate": 9.02231291309941e-05, "loss": 0.0652, "step": 4710 }, { "epoch": 0.3768613517505689, "grad_norm": 0.3397625493924417, "learning_rate": 9.017876946280443e-05, "loss": 0.0834, "step": 4720 }, { "epoch": 0.377659786817837, "grad_norm": 0.48750058843343874, "learning_rate": 9.013440979461473e-05, "loss": 0.0574, "step": 4730 }, { "epoch": 0.3784582218851052, "grad_norm": 0.5010802768596067, "learning_rate": 9.009005012642506e-05, "loss": 0.0623, "step": 4740 }, { "epoch": 0.37925665695237337, "grad_norm": 1.3902600676959112, "learning_rate": 9.004569045823538e-05, "loss": 0.0747, "step": 4750 }, { "epoch": 0.3800550920196415, "grad_norm": 0.6234122056900067, "learning_rate": 9.000133079004569e-05, "loss": 0.0478, "step": 4760 }, { "epoch": 0.38085352708690967, "grad_norm": 0.5685712290921905, "learning_rate": 8.995697112185602e-05, "loss": 0.0887, "step": 4770 }, { "epoch": 0.3816519621541778, "grad_norm": 0.565347440171693, "learning_rate": 8.991261145366633e-05, "loss": 0.0684, "step": 4780 }, { "epoch": 0.38245039722144597, "grad_norm": 0.5939386619028115, "learning_rate": 8.986825178547665e-05, "loss": 0.0656, "step": 4790 }, { "epoch": 0.38324883228871415, "grad_norm": 0.35600920379795853, "learning_rate": 8.982389211728697e-05, "loss": 0.0667, "step": 4800 }, { "epoch": 0.38404726735598227, "grad_norm": 0.3450273603243629, "learning_rate": 8.977953244909729e-05, "loss": 0.0768, "step": 4810 }, { "epoch": 0.38484570242325045, "grad_norm": 0.7355536879981746, "learning_rate": 8.97351727809076e-05, "loss": 0.0784, "step": 4820 }, { "epoch": 0.38564413749051857, "grad_norm": 0.6358098164509965, "learning_rate": 8.969081311271792e-05, "loss": 0.0482, "step": 4830 }, { "epoch": 0.38644257255778675, "grad_norm": 0.8836154598539316, "learning_rate": 8.964645344452824e-05, "loss": 0.0668, "step": 4840 }, { "epoch": 0.38724100762505487, "grad_norm": 0.24586284011234355, "learning_rate": 8.960209377633856e-05, "loss": 0.0444, "step": 4850 }, { "epoch": 0.38803944269232304, "grad_norm": 0.31228441399739965, "learning_rate": 8.955773410814887e-05, "loss": 0.0546, "step": 4860 }, { "epoch": 0.3888378777595912, "grad_norm": 1.3382422056050238, "learning_rate": 8.951337443995919e-05, "loss": 0.0637, "step": 4870 }, { "epoch": 0.38963631282685934, "grad_norm": 0.444796329308194, "learning_rate": 8.946901477176951e-05, "loss": 0.0543, "step": 4880 }, { "epoch": 0.3904347478941275, "grad_norm": 0.23308267264899266, "learning_rate": 8.942465510357983e-05, "loss": 0.067, "step": 4890 }, { "epoch": 0.39123318296139564, "grad_norm": 0.6401991484412825, "learning_rate": 8.938029543539014e-05, "loss": 0.0521, "step": 4900 }, { "epoch": 0.3920316180286638, "grad_norm": 0.22834355519756833, "learning_rate": 8.933593576720046e-05, "loss": 0.0431, "step": 4910 }, { "epoch": 0.392830053095932, "grad_norm": 0.7232285060113931, "learning_rate": 8.929157609901079e-05, "loss": 0.0547, "step": 4920 }, { "epoch": 0.3936284881632001, "grad_norm": 0.456986877784416, "learning_rate": 8.92472164308211e-05, "loss": 0.0662, "step": 4930 }, { "epoch": 0.3944269232304683, "grad_norm": 0.25476686741206894, "learning_rate": 8.920285676263141e-05, "loss": 0.0557, "step": 4940 }, { "epoch": 0.3952253582977364, "grad_norm": 0.4027583008470791, "learning_rate": 8.915849709444174e-05, "loss": 0.0559, "step": 4950 }, { "epoch": 0.3960237933650046, "grad_norm": 0.14934067044491264, "learning_rate": 8.911413742625205e-05, "loss": 0.0656, "step": 4960 }, { "epoch": 0.39682222843227277, "grad_norm": 0.14225234528381334, "learning_rate": 8.906977775806238e-05, "loss": 0.0504, "step": 4970 }, { "epoch": 0.3976206634995409, "grad_norm": 1.1907087935137126, "learning_rate": 8.90254180898727e-05, "loss": 0.071, "step": 4980 }, { "epoch": 0.39841909856680907, "grad_norm": 0.27933025425637376, "learning_rate": 8.8981058421683e-05, "loss": 0.0566, "step": 4990 }, { "epoch": 0.3992175336340772, "grad_norm": 0.6083068308403123, "learning_rate": 8.893669875349333e-05, "loss": 0.0662, "step": 5000 }, { "epoch": 0.40001596870134537, "grad_norm": 1.1884762791366723, "learning_rate": 8.889233908530365e-05, "loss": 0.0676, "step": 5010 }, { "epoch": 0.4008144037686135, "grad_norm": 1.0558648065537648, "learning_rate": 8.884797941711396e-05, "loss": 0.0705, "step": 5020 }, { "epoch": 0.40161283883588167, "grad_norm": 0.4289983200286532, "learning_rate": 8.880361974892428e-05, "loss": 0.0727, "step": 5030 }, { "epoch": 0.40241127390314985, "grad_norm": 0.4911531992853839, "learning_rate": 8.87592600807346e-05, "loss": 0.0714, "step": 5040 }, { "epoch": 0.40320970897041797, "grad_norm": 0.31150842781342475, "learning_rate": 8.871490041254492e-05, "loss": 0.0528, "step": 5050 }, { "epoch": 0.40400814403768615, "grad_norm": 0.6489978032345287, "learning_rate": 8.867054074435523e-05, "loss": 0.0751, "step": 5060 }, { "epoch": 0.40480657910495427, "grad_norm": 0.419948904458347, "learning_rate": 8.862618107616557e-05, "loss": 0.062, "step": 5070 }, { "epoch": 0.40560501417222244, "grad_norm": 0.2183663052059662, "learning_rate": 8.858182140797587e-05, "loss": 0.0638, "step": 5080 }, { "epoch": 0.4064034492394906, "grad_norm": 0.26154611890494367, "learning_rate": 8.853746173978619e-05, "loss": 0.0483, "step": 5090 }, { "epoch": 0.40720188430675874, "grad_norm": 1.4954231531687736, "learning_rate": 8.849310207159652e-05, "loss": 0.0943, "step": 5100 }, { "epoch": 0.4080003193740269, "grad_norm": 0.3045976586748024, "learning_rate": 8.844874240340682e-05, "loss": 0.0679, "step": 5110 }, { "epoch": 0.40879875444129504, "grad_norm": 0.36479206890797944, "learning_rate": 8.840438273521714e-05, "loss": 0.0614, "step": 5120 }, { "epoch": 0.4095971895085632, "grad_norm": 0.2676201553843575, "learning_rate": 8.836002306702747e-05, "loss": 0.0412, "step": 5130 }, { "epoch": 0.4103956245758314, "grad_norm": 1.0386926051609444, "learning_rate": 8.831566339883777e-05, "loss": 0.0748, "step": 5140 }, { "epoch": 0.4111940596430995, "grad_norm": 0.3438745636536693, "learning_rate": 8.82713037306481e-05, "loss": 0.0603, "step": 5150 }, { "epoch": 0.4119924947103677, "grad_norm": 0.6260398748757086, "learning_rate": 8.822694406245842e-05, "loss": 0.0845, "step": 5160 }, { "epoch": 0.4127909297776358, "grad_norm": 0.5703119341755367, "learning_rate": 8.818258439426873e-05, "loss": 0.073, "step": 5170 }, { "epoch": 0.413589364844904, "grad_norm": 0.6937887146171914, "learning_rate": 8.813822472607906e-05, "loss": 0.0643, "step": 5180 }, { "epoch": 0.4143877999121721, "grad_norm": 1.1824674908984751, "learning_rate": 8.809386505788937e-05, "loss": 0.0754, "step": 5190 }, { "epoch": 0.4151862349794403, "grad_norm": 1.7614692575489068, "learning_rate": 8.804950538969969e-05, "loss": 0.1003, "step": 5200 }, { "epoch": 0.41598467004670847, "grad_norm": 0.3060089486060604, "learning_rate": 8.800514572151001e-05, "loss": 0.0715, "step": 5210 }, { "epoch": 0.4167831051139766, "grad_norm": 0.3910737594616741, "learning_rate": 8.796078605332033e-05, "loss": 0.0508, "step": 5220 }, { "epoch": 0.41758154018124477, "grad_norm": 0.19993085932726584, "learning_rate": 8.791642638513064e-05, "loss": 0.0573, "step": 5230 }, { "epoch": 0.4183799752485129, "grad_norm": 0.27516676488467984, "learning_rate": 8.787206671694096e-05, "loss": 0.0653, "step": 5240 }, { "epoch": 0.41917841031578107, "grad_norm": 0.6167919795211267, "learning_rate": 8.782770704875128e-05, "loss": 0.0942, "step": 5250 }, { "epoch": 0.41997684538304925, "grad_norm": 0.27936432847287795, "learning_rate": 8.77833473805616e-05, "loss": 0.0738, "step": 5260 }, { "epoch": 0.42077528045031737, "grad_norm": 0.7588252383320675, "learning_rate": 8.773898771237191e-05, "loss": 0.0523, "step": 5270 }, { "epoch": 0.42157371551758555, "grad_norm": 0.06949164789983199, "learning_rate": 8.769462804418223e-05, "loss": 0.0795, "step": 5280 }, { "epoch": 0.42237215058485367, "grad_norm": 0.3957542748855769, "learning_rate": 8.765026837599255e-05, "loss": 0.0865, "step": 5290 }, { "epoch": 0.42317058565212184, "grad_norm": 0.280739201904563, "learning_rate": 8.760590870780288e-05, "loss": 0.0508, "step": 5300 }, { "epoch": 0.42396902071939, "grad_norm": 0.5673662143218914, "learning_rate": 8.756154903961318e-05, "loss": 0.0744, "step": 5310 }, { "epoch": 0.42476745578665814, "grad_norm": 0.47405142279811713, "learning_rate": 8.75171893714235e-05, "loss": 0.0616, "step": 5320 }, { "epoch": 0.4255658908539263, "grad_norm": 0.1281058538132511, "learning_rate": 8.747282970323383e-05, "loss": 0.0434, "step": 5330 }, { "epoch": 0.42636432592119444, "grad_norm": 0.4979701542404763, "learning_rate": 8.742847003504413e-05, "loss": 0.0572, "step": 5340 }, { "epoch": 0.4271627609884626, "grad_norm": 1.4594895339218368, "learning_rate": 8.738411036685445e-05, "loss": 0.077, "step": 5350 }, { "epoch": 0.42796119605573074, "grad_norm": 0.41948570674673674, "learning_rate": 8.733975069866478e-05, "loss": 0.0654, "step": 5360 }, { "epoch": 0.4287596311229989, "grad_norm": 0.7352581247356831, "learning_rate": 8.729539103047509e-05, "loss": 0.0584, "step": 5370 }, { "epoch": 0.4295580661902671, "grad_norm": 0.23926559831019342, "learning_rate": 8.725103136228542e-05, "loss": 0.0412, "step": 5380 }, { "epoch": 0.4303565012575352, "grad_norm": 0.7753469657671935, "learning_rate": 8.720667169409573e-05, "loss": 0.0737, "step": 5390 }, { "epoch": 0.4311549363248034, "grad_norm": 0.6734499022356067, "learning_rate": 8.716231202590605e-05, "loss": 0.0749, "step": 5400 }, { "epoch": 0.4319533713920715, "grad_norm": 0.8067819775253303, "learning_rate": 8.711795235771637e-05, "loss": 0.0724, "step": 5410 }, { "epoch": 0.4327518064593397, "grad_norm": 0.3922789027943089, "learning_rate": 8.707359268952669e-05, "loss": 0.0714, "step": 5420 }, { "epoch": 0.43355024152660787, "grad_norm": 0.2643907438829241, "learning_rate": 8.7029233021337e-05, "loss": 0.0727, "step": 5430 }, { "epoch": 0.434348676593876, "grad_norm": 0.22125726238970103, "learning_rate": 8.698487335314732e-05, "loss": 0.0731, "step": 5440 }, { "epoch": 0.43514711166114417, "grad_norm": 0.5787553670066385, "learning_rate": 8.694051368495764e-05, "loss": 0.0699, "step": 5450 }, { "epoch": 0.4359455467284123, "grad_norm": 0.9740759075600869, "learning_rate": 8.689615401676796e-05, "loss": 0.0586, "step": 5460 }, { "epoch": 0.43674398179568047, "grad_norm": 0.5602442417248852, "learning_rate": 8.685179434857827e-05, "loss": 0.0517, "step": 5470 }, { "epoch": 0.43754241686294865, "grad_norm": 0.678676901103541, "learning_rate": 8.68074346803886e-05, "loss": 0.0557, "step": 5480 }, { "epoch": 0.43834085193021677, "grad_norm": 1.3188535290570824, "learning_rate": 8.676307501219891e-05, "loss": 0.0699, "step": 5490 }, { "epoch": 0.43913928699748495, "grad_norm": 1.2435485858474957, "learning_rate": 8.671871534400923e-05, "loss": 0.0907, "step": 5500 }, { "epoch": 0.43993772206475307, "grad_norm": 0.5531882667257373, "learning_rate": 8.667435567581956e-05, "loss": 0.0534, "step": 5510 }, { "epoch": 0.44073615713202124, "grad_norm": 0.209508564195029, "learning_rate": 8.662999600762986e-05, "loss": 0.0593, "step": 5520 }, { "epoch": 0.44153459219928937, "grad_norm": 0.40802476016406686, "learning_rate": 8.658563633944019e-05, "loss": 0.0497, "step": 5530 }, { "epoch": 0.44233302726655754, "grad_norm": 0.5003442540085888, "learning_rate": 8.654127667125051e-05, "loss": 0.0528, "step": 5540 }, { "epoch": 0.4431314623338257, "grad_norm": 0.5347815858537798, "learning_rate": 8.649691700306081e-05, "loss": 0.0484, "step": 5550 }, { "epoch": 0.44392989740109384, "grad_norm": 0.21195410850548047, "learning_rate": 8.645255733487114e-05, "loss": 0.0411, "step": 5560 }, { "epoch": 0.444728332468362, "grad_norm": 0.33320854722031934, "learning_rate": 8.640819766668146e-05, "loss": 0.0441, "step": 5570 }, { "epoch": 0.44552676753563014, "grad_norm": 0.5977688428709171, "learning_rate": 8.636383799849177e-05, "loss": 0.0711, "step": 5580 }, { "epoch": 0.4463252026028983, "grad_norm": 0.8369268611940605, "learning_rate": 8.63194783303021e-05, "loss": 0.0525, "step": 5590 }, { "epoch": 0.4471236376701665, "grad_norm": 0.3805775222554737, "learning_rate": 8.627511866211241e-05, "loss": 0.0602, "step": 5600 }, { "epoch": 0.4479220727374346, "grad_norm": 0.8957144762619743, "learning_rate": 8.623075899392273e-05, "loss": 0.0532, "step": 5610 }, { "epoch": 0.4487205078047028, "grad_norm": 0.4652900799065469, "learning_rate": 8.618639932573305e-05, "loss": 0.0717, "step": 5620 }, { "epoch": 0.4495189428719709, "grad_norm": 0.37729025821964557, "learning_rate": 8.614203965754337e-05, "loss": 0.0688, "step": 5630 }, { "epoch": 0.4503173779392391, "grad_norm": 1.040481843266623, "learning_rate": 8.609767998935368e-05, "loss": 0.0445, "step": 5640 }, { "epoch": 0.45111581300650727, "grad_norm": 0.3330126604993848, "learning_rate": 8.6053320321164e-05, "loss": 0.065, "step": 5650 }, { "epoch": 0.4519142480737754, "grad_norm": 0.543884899784255, "learning_rate": 8.600896065297432e-05, "loss": 0.0601, "step": 5660 }, { "epoch": 0.45271268314104357, "grad_norm": 0.2653124950369882, "learning_rate": 8.596460098478463e-05, "loss": 0.0545, "step": 5670 }, { "epoch": 0.4535111182083117, "grad_norm": 0.6739101914855006, "learning_rate": 8.592024131659495e-05, "loss": 0.0401, "step": 5680 }, { "epoch": 0.45430955327557987, "grad_norm": 0.2050892659042918, "learning_rate": 8.587588164840527e-05, "loss": 0.0647, "step": 5690 }, { "epoch": 0.455107988342848, "grad_norm": 0.5184564770421916, "learning_rate": 8.583152198021559e-05, "loss": 0.0566, "step": 5700 }, { "epoch": 0.45590642341011617, "grad_norm": 0.4895769280806872, "learning_rate": 8.578716231202592e-05, "loss": 0.0434, "step": 5710 }, { "epoch": 0.45670485847738435, "grad_norm": 0.14379874304204618, "learning_rate": 8.574280264383622e-05, "loss": 0.0379, "step": 5720 }, { "epoch": 0.45750329354465247, "grad_norm": 0.5671708658830928, "learning_rate": 8.569844297564654e-05, "loss": 0.0436, "step": 5730 }, { "epoch": 0.45830172861192064, "grad_norm": 0.7415929057111613, "learning_rate": 8.565408330745687e-05, "loss": 0.0713, "step": 5740 }, { "epoch": 0.45910016367918877, "grad_norm": 0.8550707763593818, "learning_rate": 8.560972363926719e-05, "loss": 0.0788, "step": 5750 }, { "epoch": 0.45989859874645694, "grad_norm": 0.723475303117586, "learning_rate": 8.55653639710775e-05, "loss": 0.0496, "step": 5760 }, { "epoch": 0.4606970338137251, "grad_norm": 0.10177492691267885, "learning_rate": 8.552100430288782e-05, "loss": 0.0523, "step": 5770 }, { "epoch": 0.46149546888099324, "grad_norm": 0.5119863271676398, "learning_rate": 8.547664463469814e-05, "loss": 0.0842, "step": 5780 }, { "epoch": 0.4622939039482614, "grad_norm": 0.8375508714525854, "learning_rate": 8.543228496650846e-05, "loss": 0.0751, "step": 5790 }, { "epoch": 0.46309233901552954, "grad_norm": 0.3543710763686105, "learning_rate": 8.538792529831877e-05, "loss": 0.0626, "step": 5800 }, { "epoch": 0.4638907740827977, "grad_norm": 0.21516304688172944, "learning_rate": 8.534356563012909e-05, "loss": 0.0556, "step": 5810 }, { "epoch": 0.4646892091500659, "grad_norm": 0.5369649304258546, "learning_rate": 8.529920596193941e-05, "loss": 0.0924, "step": 5820 }, { "epoch": 0.465487644217334, "grad_norm": 0.16686767153061297, "learning_rate": 8.525484629374973e-05, "loss": 0.0498, "step": 5830 }, { "epoch": 0.4662860792846022, "grad_norm": 0.5692944993426537, "learning_rate": 8.521048662556004e-05, "loss": 0.0618, "step": 5840 }, { "epoch": 0.4670845143518703, "grad_norm": 0.2500282256324541, "learning_rate": 8.516612695737036e-05, "loss": 0.0572, "step": 5850 }, { "epoch": 0.4678829494191385, "grad_norm": 1.0243874711555592, "learning_rate": 8.512176728918068e-05, "loss": 0.0697, "step": 5860 }, { "epoch": 0.4686813844864066, "grad_norm": 1.1440126889078868, "learning_rate": 8.5077407620991e-05, "loss": 0.0522, "step": 5870 }, { "epoch": 0.4694798195536748, "grad_norm": 0.6464834791986863, "learning_rate": 8.503304795280131e-05, "loss": 0.0495, "step": 5880 }, { "epoch": 0.47027825462094297, "grad_norm": 0.2987467279017007, "learning_rate": 8.498868828461164e-05, "loss": 0.051, "step": 5890 }, { "epoch": 0.4710766896882111, "grad_norm": 0.5446392106542469, "learning_rate": 8.494432861642195e-05, "loss": 0.07, "step": 5900 }, { "epoch": 0.47187512475547927, "grad_norm": 0.3319188507450066, "learning_rate": 8.489996894823227e-05, "loss": 0.0618, "step": 5910 }, { "epoch": 0.4726735598227474, "grad_norm": 0.17413701444289367, "learning_rate": 8.48556092800426e-05, "loss": 0.0615, "step": 5920 }, { "epoch": 0.47347199489001557, "grad_norm": 0.4660774684817729, "learning_rate": 8.48112496118529e-05, "loss": 0.0473, "step": 5930 }, { "epoch": 0.47427042995728375, "grad_norm": 0.31409912418756264, "learning_rate": 8.476688994366323e-05, "loss": 0.0352, "step": 5940 }, { "epoch": 0.47506886502455187, "grad_norm": 0.38023200269157886, "learning_rate": 8.472253027547355e-05, "loss": 0.0747, "step": 5950 }, { "epoch": 0.47586730009182004, "grad_norm": 1.0388339923063081, "learning_rate": 8.467817060728385e-05, "loss": 0.0711, "step": 5960 }, { "epoch": 0.47666573515908817, "grad_norm": 0.4483516898214409, "learning_rate": 8.463381093909418e-05, "loss": 0.0828, "step": 5970 }, { "epoch": 0.47746417022635634, "grad_norm": 1.308927326688913, "learning_rate": 8.45894512709045e-05, "loss": 0.0692, "step": 5980 }, { "epoch": 0.4782626052936245, "grad_norm": 0.8389767702919784, "learning_rate": 8.454509160271482e-05, "loss": 0.0501, "step": 5990 }, { "epoch": 0.47906104036089264, "grad_norm": 0.34836370344580164, "learning_rate": 8.450073193452514e-05, "loss": 0.0586, "step": 6000 }, { "epoch": 0.4798594754281608, "grad_norm": 0.4295239557610677, "learning_rate": 8.445637226633545e-05, "loss": 0.0675, "step": 6010 }, { "epoch": 0.48065791049542894, "grad_norm": 0.35036280574185724, "learning_rate": 8.441201259814577e-05, "loss": 0.0624, "step": 6020 }, { "epoch": 0.4814563455626971, "grad_norm": 0.44702820482053074, "learning_rate": 8.436765292995609e-05, "loss": 0.0694, "step": 6030 }, { "epoch": 0.48225478062996524, "grad_norm": 0.6531156283955792, "learning_rate": 8.43232932617664e-05, "loss": 0.0765, "step": 6040 }, { "epoch": 0.4830532156972334, "grad_norm": 0.4330465340758785, "learning_rate": 8.427893359357672e-05, "loss": 0.0667, "step": 6050 }, { "epoch": 0.4838516507645016, "grad_norm": 0.806120927975259, "learning_rate": 8.423457392538704e-05, "loss": 0.0624, "step": 6060 }, { "epoch": 0.4846500858317697, "grad_norm": 1.1472878212957334, "learning_rate": 8.419021425719736e-05, "loss": 0.0524, "step": 6070 }, { "epoch": 0.4854485208990379, "grad_norm": 1.0107019337484981, "learning_rate": 8.414585458900767e-05, "loss": 0.0919, "step": 6080 }, { "epoch": 0.486246955966306, "grad_norm": 0.43287809560013385, "learning_rate": 8.410149492081799e-05, "loss": 0.0825, "step": 6090 }, { "epoch": 0.4870453910335742, "grad_norm": 0.751653585498906, "learning_rate": 8.405713525262832e-05, "loss": 0.064, "step": 6100 }, { "epoch": 0.48784382610084237, "grad_norm": 1.279229861370186, "learning_rate": 8.401277558443863e-05, "loss": 0.0678, "step": 6110 }, { "epoch": 0.4886422611681105, "grad_norm": 0.7924618650401917, "learning_rate": 8.396841591624896e-05, "loss": 0.0619, "step": 6120 }, { "epoch": 0.48944069623537867, "grad_norm": 1.7361339434442595, "learning_rate": 8.392405624805928e-05, "loss": 0.0816, "step": 6130 }, { "epoch": 0.4902391313026468, "grad_norm": 0.7861976029319782, "learning_rate": 8.387969657986958e-05, "loss": 0.0939, "step": 6140 }, { "epoch": 0.49103756636991497, "grad_norm": 0.7619765245860369, "learning_rate": 8.383533691167991e-05, "loss": 0.0813, "step": 6150 }, { "epoch": 0.49183600143718315, "grad_norm": 0.5114292307220892, "learning_rate": 8.379097724349023e-05, "loss": 0.0616, "step": 6160 }, { "epoch": 0.49263443650445127, "grad_norm": 0.1391537940982425, "learning_rate": 8.374661757530054e-05, "loss": 0.0427, "step": 6170 }, { "epoch": 0.49343287157171944, "grad_norm": 0.6944831169186162, "learning_rate": 8.370225790711086e-05, "loss": 0.0767, "step": 6180 }, { "epoch": 0.49423130663898757, "grad_norm": 0.3894128639134844, "learning_rate": 8.365789823892118e-05, "loss": 0.0585, "step": 6190 }, { "epoch": 0.49502974170625574, "grad_norm": 0.9576661297036134, "learning_rate": 8.36135385707315e-05, "loss": 0.0897, "step": 6200 }, { "epoch": 0.49582817677352387, "grad_norm": 0.45017824313200255, "learning_rate": 8.356917890254181e-05, "loss": 0.0551, "step": 6210 }, { "epoch": 0.49662661184079204, "grad_norm": 0.578217231479692, "learning_rate": 8.352481923435213e-05, "loss": 0.0675, "step": 6220 }, { "epoch": 0.4974250469080602, "grad_norm": 0.8903703995378134, "learning_rate": 8.348045956616245e-05, "loss": 0.0682, "step": 6230 }, { "epoch": 0.49822348197532834, "grad_norm": 0.14260077092570403, "learning_rate": 8.343609989797277e-05, "loss": 0.0621, "step": 6240 }, { "epoch": 0.4990219170425965, "grad_norm": 0.8075310070857461, "learning_rate": 8.339174022978308e-05, "loss": 0.061, "step": 6250 }, { "epoch": 0.49982035210986464, "grad_norm": 0.7242461553616462, "learning_rate": 8.33473805615934e-05, "loss": 0.0927, "step": 6260 }, { "epoch": 0.5006187871771328, "grad_norm": 0.2005813623355205, "learning_rate": 8.330302089340372e-05, "loss": 0.0779, "step": 6270 }, { "epoch": 0.501417222244401, "grad_norm": 0.22869071139284775, "learning_rate": 8.325866122521404e-05, "loss": 0.0329, "step": 6280 }, { "epoch": 0.5022156573116692, "grad_norm": 2.3083924183194178, "learning_rate": 8.321430155702435e-05, "loss": 0.0619, "step": 6290 }, { "epoch": 0.5030140923789372, "grad_norm": 0.7434249385683861, "learning_rate": 8.316994188883468e-05, "loss": 0.0703, "step": 6300 }, { "epoch": 0.5038125274462054, "grad_norm": 0.9138191544723026, "learning_rate": 8.312558222064499e-05, "loss": 0.0577, "step": 6310 }, { "epoch": 0.5046109625134736, "grad_norm": 0.25653158436649787, "learning_rate": 8.30812225524553e-05, "loss": 0.0478, "step": 6320 }, { "epoch": 0.5054093975807418, "grad_norm": 0.5620053416520807, "learning_rate": 8.303686288426564e-05, "loss": 0.0481, "step": 6330 }, { "epoch": 0.50620783264801, "grad_norm": 0.1276357272302078, "learning_rate": 8.299250321607594e-05, "loss": 0.0479, "step": 6340 }, { "epoch": 0.507006267715278, "grad_norm": 0.60544258687383, "learning_rate": 8.294814354788627e-05, "loss": 0.0738, "step": 6350 }, { "epoch": 0.5078047027825462, "grad_norm": 0.4988768974466748, "learning_rate": 8.290378387969659e-05, "loss": 0.0602, "step": 6360 }, { "epoch": 0.5086031378498144, "grad_norm": 0.8380837716515054, "learning_rate": 8.285942421150689e-05, "loss": 0.0515, "step": 6370 }, { "epoch": 0.5094015729170825, "grad_norm": 0.2649314561466298, "learning_rate": 8.281506454331722e-05, "loss": 0.0526, "step": 6380 }, { "epoch": 0.5102000079843507, "grad_norm": 0.16309396798317682, "learning_rate": 8.277070487512754e-05, "loss": 0.0555, "step": 6390 }, { "epoch": 0.5109984430516188, "grad_norm": 1.2443648203406286, "learning_rate": 8.272634520693786e-05, "loss": 0.0613, "step": 6400 }, { "epoch": 0.511796878118887, "grad_norm": 0.6419294388683323, "learning_rate": 8.268198553874818e-05, "loss": 0.0568, "step": 6410 }, { "epoch": 0.5125953131861551, "grad_norm": 0.29889506341617506, "learning_rate": 8.263762587055849e-05, "loss": 0.0582, "step": 6420 }, { "epoch": 0.5133937482534233, "grad_norm": 0.6487256851752289, "learning_rate": 8.259326620236881e-05, "loss": 0.0562, "step": 6430 }, { "epoch": 0.5141921833206915, "grad_norm": 0.4604599082658974, "learning_rate": 8.254890653417913e-05, "loss": 0.0535, "step": 6440 }, { "epoch": 0.5149906183879596, "grad_norm": 0.5571572469431415, "learning_rate": 8.250454686598946e-05, "loss": 0.0595, "step": 6450 }, { "epoch": 0.5157890534552277, "grad_norm": 0.16286352259235828, "learning_rate": 8.246018719779976e-05, "loss": 0.0645, "step": 6460 }, { "epoch": 0.5165874885224959, "grad_norm": 0.2423915931929422, "learning_rate": 8.241582752961008e-05, "loss": 0.04, "step": 6470 }, { "epoch": 0.5173859235897641, "grad_norm": 0.12424351376419095, "learning_rate": 8.237146786142041e-05, "loss": 0.0663, "step": 6480 }, { "epoch": 0.5181843586570322, "grad_norm": 0.72651821446849, "learning_rate": 8.232710819323071e-05, "loss": 0.0522, "step": 6490 }, { "epoch": 0.5189827937243003, "grad_norm": 0.7950904269617449, "learning_rate": 8.228274852504103e-05, "loss": 0.0492, "step": 6500 }, { "epoch": 0.5197812287915685, "grad_norm": 0.5747491220221507, "learning_rate": 8.223838885685136e-05, "loss": 0.0844, "step": 6510 }, { "epoch": 0.5205796638588367, "grad_norm": 0.2782512901079484, "learning_rate": 8.219402918866167e-05, "loss": 0.0475, "step": 6520 }, { "epoch": 0.5213780989261049, "grad_norm": 0.2648292458710825, "learning_rate": 8.2149669520472e-05, "loss": 0.0544, "step": 6530 }, { "epoch": 0.5221765339933729, "grad_norm": 0.19105815570303936, "learning_rate": 8.210530985228231e-05, "loss": 0.0653, "step": 6540 }, { "epoch": 0.5229749690606411, "grad_norm": 0.46407076378758166, "learning_rate": 8.206095018409262e-05, "loss": 0.0819, "step": 6550 }, { "epoch": 0.5237734041279093, "grad_norm": 0.12564468042648594, "learning_rate": 8.201659051590295e-05, "loss": 0.0684, "step": 6560 }, { "epoch": 0.5245718391951775, "grad_norm": 0.62639241054009, "learning_rate": 8.197223084771327e-05, "loss": 0.0632, "step": 6570 }, { "epoch": 0.5253702742624456, "grad_norm": 0.3230944239837902, "learning_rate": 8.192787117952358e-05, "loss": 0.0477, "step": 6580 }, { "epoch": 0.5261687093297137, "grad_norm": 0.9896576099526085, "learning_rate": 8.18835115113339e-05, "loss": 0.0764, "step": 6590 }, { "epoch": 0.5269671443969819, "grad_norm": 0.28160344356170797, "learning_rate": 8.183915184314422e-05, "loss": 0.0542, "step": 6600 }, { "epoch": 0.5277655794642501, "grad_norm": 1.683063453242496, "learning_rate": 8.179479217495454e-05, "loss": 0.0467, "step": 6610 }, { "epoch": 0.5285640145315182, "grad_norm": 0.17755025572391545, "learning_rate": 8.175043250676485e-05, "loss": 0.0566, "step": 6620 }, { "epoch": 0.5293624495987864, "grad_norm": 0.45573443555969606, "learning_rate": 8.170607283857517e-05, "loss": 0.0677, "step": 6630 }, { "epoch": 0.5301608846660545, "grad_norm": 0.1575084916438133, "learning_rate": 8.166171317038549e-05, "loss": 0.0575, "step": 6640 }, { "epoch": 0.5309593197333227, "grad_norm": 1.2877144807985812, "learning_rate": 8.16173535021958e-05, "loss": 0.0678, "step": 6650 }, { "epoch": 0.5317577548005908, "grad_norm": 0.18522169090671942, "learning_rate": 8.157299383400612e-05, "loss": 0.0604, "step": 6660 }, { "epoch": 0.532556189867859, "grad_norm": 0.7266885221486851, "learning_rate": 8.152863416581644e-05, "loss": 0.0681, "step": 6670 }, { "epoch": 0.5333546249351272, "grad_norm": 0.12652671512749664, "learning_rate": 8.148427449762677e-05, "loss": 0.0538, "step": 6680 }, { "epoch": 0.5341530600023953, "grad_norm": 0.9739164827650703, "learning_rate": 8.143991482943708e-05, "loss": 0.0836, "step": 6690 }, { "epoch": 0.5349514950696634, "grad_norm": 0.17376810669729617, "learning_rate": 8.139555516124739e-05, "loss": 0.0889, "step": 6700 }, { "epoch": 0.5357499301369316, "grad_norm": 0.6641373869163394, "learning_rate": 8.135119549305772e-05, "loss": 0.0764, "step": 6710 }, { "epoch": 0.5365483652041998, "grad_norm": 0.16815120710505263, "learning_rate": 8.130683582486803e-05, "loss": 0.0672, "step": 6720 }, { "epoch": 0.537346800271468, "grad_norm": 0.5796326559620748, "learning_rate": 8.126247615667834e-05, "loss": 0.078, "step": 6730 }, { "epoch": 0.538145235338736, "grad_norm": 0.1412822636172128, "learning_rate": 8.121811648848868e-05, "loss": 0.0652, "step": 6740 }, { "epoch": 0.5389436704060042, "grad_norm": 0.18003108883859614, "learning_rate": 8.117375682029898e-05, "loss": 0.0508, "step": 6750 }, { "epoch": 0.5397421054732724, "grad_norm": 0.36894859982760714, "learning_rate": 8.112939715210931e-05, "loss": 0.0612, "step": 6760 }, { "epoch": 0.5405405405405406, "grad_norm": 0.2407377212282452, "learning_rate": 8.108503748391963e-05, "loss": 0.0435, "step": 6770 }, { "epoch": 0.5413389756078087, "grad_norm": 1.352276758332171, "learning_rate": 8.104067781572993e-05, "loss": 0.0893, "step": 6780 }, { "epoch": 0.5421374106750768, "grad_norm": 0.6181733852818506, "learning_rate": 8.099631814754026e-05, "loss": 0.0631, "step": 6790 }, { "epoch": 0.542935845742345, "grad_norm": 0.510119508066543, "learning_rate": 8.095195847935058e-05, "loss": 0.065, "step": 6800 }, { "epoch": 0.5437342808096132, "grad_norm": 0.33556736143265536, "learning_rate": 8.09075988111609e-05, "loss": 0.052, "step": 6810 }, { "epoch": 0.5445327158768813, "grad_norm": 0.23362913712809788, "learning_rate": 8.086323914297121e-05, "loss": 0.06, "step": 6820 }, { "epoch": 0.5453311509441494, "grad_norm": 1.0108831673519008, "learning_rate": 8.081887947478153e-05, "loss": 0.1047, "step": 6830 }, { "epoch": 0.5461295860114176, "grad_norm": 0.9061380750994878, "learning_rate": 8.077451980659185e-05, "loss": 0.0683, "step": 6840 }, { "epoch": 0.5469280210786858, "grad_norm": 0.42524090606390924, "learning_rate": 8.073016013840217e-05, "loss": 0.0577, "step": 6850 }, { "epoch": 0.5477264561459539, "grad_norm": 0.9112083691474904, "learning_rate": 8.06858004702125e-05, "loss": 0.0628, "step": 6860 }, { "epoch": 0.5485248912132221, "grad_norm": 0.343851767869507, "learning_rate": 8.06414408020228e-05, "loss": 0.0411, "step": 6870 }, { "epoch": 0.5493233262804902, "grad_norm": 0.26288157493297026, "learning_rate": 8.059708113383312e-05, "loss": 0.0884, "step": 6880 }, { "epoch": 0.5501217613477584, "grad_norm": 0.672374580318619, "learning_rate": 8.055272146564345e-05, "loss": 0.0575, "step": 6890 }, { "epoch": 0.5509201964150265, "grad_norm": 0.25631627581337185, "learning_rate": 8.050836179745375e-05, "loss": 0.0367, "step": 6900 }, { "epoch": 0.5517186314822947, "grad_norm": 0.459897810225924, "learning_rate": 8.046400212926408e-05, "loss": 0.0782, "step": 6910 }, { "epoch": 0.5525170665495629, "grad_norm": 0.5999621078607194, "learning_rate": 8.04196424610744e-05, "loss": 0.075, "step": 6920 }, { "epoch": 0.553315501616831, "grad_norm": 0.23982515518970804, "learning_rate": 8.03752827928847e-05, "loss": 0.0424, "step": 6930 }, { "epoch": 0.5541139366840991, "grad_norm": 0.23837663059539815, "learning_rate": 8.033092312469504e-05, "loss": 0.0547, "step": 6940 }, { "epoch": 0.5549123717513673, "grad_norm": 0.46337228232386396, "learning_rate": 8.028656345650535e-05, "loss": 0.0671, "step": 6950 }, { "epoch": 0.5557108068186355, "grad_norm": 2.0716745224894217, "learning_rate": 8.024220378831566e-05, "loss": 0.0961, "step": 6960 }, { "epoch": 0.5565092418859037, "grad_norm": 0.38258068674015955, "learning_rate": 8.019784412012599e-05, "loss": 0.0641, "step": 6970 }, { "epoch": 0.5573076769531717, "grad_norm": 0.6422500771717725, "learning_rate": 8.01534844519363e-05, "loss": 0.0682, "step": 6980 }, { "epoch": 0.5581061120204399, "grad_norm": 0.6046415591096082, "learning_rate": 8.010912478374662e-05, "loss": 0.0972, "step": 6990 }, { "epoch": 0.5589045470877081, "grad_norm": 0.8616981377908823, "learning_rate": 8.006476511555694e-05, "loss": 0.0746, "step": 7000 }, { "epoch": 0.5597029821549763, "grad_norm": 0.24097579377088618, "learning_rate": 8.002040544736726e-05, "loss": 0.0751, "step": 7010 }, { "epoch": 0.5605014172222444, "grad_norm": 0.6522706315596682, "learning_rate": 7.997604577917758e-05, "loss": 0.0471, "step": 7020 }, { "epoch": 0.5612998522895125, "grad_norm": 1.2638941405843032, "learning_rate": 7.99316861109879e-05, "loss": 0.0571, "step": 7030 }, { "epoch": 0.5620982873567807, "grad_norm": 0.1313943832208323, "learning_rate": 7.988732644279821e-05, "loss": 0.0374, "step": 7040 }, { "epoch": 0.5628967224240489, "grad_norm": 0.6033488591473235, "learning_rate": 7.984296677460853e-05, "loss": 0.0527, "step": 7050 }, { "epoch": 0.563695157491317, "grad_norm": 0.19916184688760954, "learning_rate": 7.979860710641885e-05, "loss": 0.0578, "step": 7060 }, { "epoch": 0.5644935925585852, "grad_norm": 0.5973224327114647, "learning_rate": 7.975424743822916e-05, "loss": 0.066, "step": 7070 }, { "epoch": 0.5652920276258533, "grad_norm": 0.4957434283307288, "learning_rate": 7.970988777003948e-05, "loss": 0.0496, "step": 7080 }, { "epoch": 0.5660904626931215, "grad_norm": 0.7014221037795486, "learning_rate": 7.966552810184981e-05, "loss": 0.0536, "step": 7090 }, { "epoch": 0.5668888977603896, "grad_norm": 0.2942948589315806, "learning_rate": 7.962116843366012e-05, "loss": 0.0631, "step": 7100 }, { "epoch": 0.5676873328276578, "grad_norm": 0.6122650378671805, "learning_rate": 7.957680876547043e-05, "loss": 0.0758, "step": 7110 }, { "epoch": 0.568485767894926, "grad_norm": 0.06698143407689137, "learning_rate": 7.953244909728076e-05, "loss": 0.0487, "step": 7120 }, { "epoch": 0.5692842029621941, "grad_norm": 0.49897005838474545, "learning_rate": 7.948808942909107e-05, "loss": 0.0745, "step": 7130 }, { "epoch": 0.5700826380294622, "grad_norm": 0.12126764090029286, "learning_rate": 7.94437297609014e-05, "loss": 0.0491, "step": 7140 }, { "epoch": 0.5708810730967304, "grad_norm": 0.6765414621536705, "learning_rate": 7.939937009271172e-05, "loss": 0.0707, "step": 7150 }, { "epoch": 0.5716795081639986, "grad_norm": 0.7488705487173876, "learning_rate": 7.935501042452202e-05, "loss": 0.0707, "step": 7160 }, { "epoch": 0.5724779432312668, "grad_norm": 0.4396172085640616, "learning_rate": 7.931065075633235e-05, "loss": 0.0566, "step": 7170 }, { "epoch": 0.5732763782985348, "grad_norm": 0.8264001543643991, "learning_rate": 7.926629108814267e-05, "loss": 0.0741, "step": 7180 }, { "epoch": 0.574074813365803, "grad_norm": 0.34922891662042377, "learning_rate": 7.922193141995297e-05, "loss": 0.0605, "step": 7190 }, { "epoch": 0.5748732484330712, "grad_norm": 1.2243612635366266, "learning_rate": 7.91775717517633e-05, "loss": 0.093, "step": 7200 }, { "epoch": 0.5756716835003394, "grad_norm": 0.7752867030859307, "learning_rate": 7.913321208357362e-05, "loss": 0.0735, "step": 7210 }, { "epoch": 0.5764701185676074, "grad_norm": 0.260444765912474, "learning_rate": 7.908885241538394e-05, "loss": 0.0618, "step": 7220 }, { "epoch": 0.5772685536348756, "grad_norm": 0.20684347358549432, "learning_rate": 7.904449274719425e-05, "loss": 0.0528, "step": 7230 }, { "epoch": 0.5780669887021438, "grad_norm": 0.5262456398761236, "learning_rate": 7.900013307900457e-05, "loss": 0.0702, "step": 7240 }, { "epoch": 0.578865423769412, "grad_norm": 0.4831605388391065, "learning_rate": 7.895577341081489e-05, "loss": 0.0573, "step": 7250 }, { "epoch": 0.5796638588366801, "grad_norm": 1.350413669611844, "learning_rate": 7.89114137426252e-05, "loss": 0.0811, "step": 7260 }, { "epoch": 0.5804622939039482, "grad_norm": 0.17202686615065826, "learning_rate": 7.886705407443554e-05, "loss": 0.0662, "step": 7270 }, { "epoch": 0.5812607289712164, "grad_norm": 0.3215578422459197, "learning_rate": 7.882269440624584e-05, "loss": 0.0511, "step": 7280 }, { "epoch": 0.5820591640384846, "grad_norm": 0.4914632124251946, "learning_rate": 7.877833473805616e-05, "loss": 0.0976, "step": 7290 }, { "epoch": 0.5828575991057527, "grad_norm": 0.6502099273735706, "learning_rate": 7.873397506986649e-05, "loss": 0.0552, "step": 7300 }, { "epoch": 0.5836560341730209, "grad_norm": 0.4750500443154491, "learning_rate": 7.86896154016768e-05, "loss": 0.0637, "step": 7310 }, { "epoch": 0.584454469240289, "grad_norm": 0.2973395613219485, "learning_rate": 7.864525573348712e-05, "loss": 0.0625, "step": 7320 }, { "epoch": 0.5852529043075572, "grad_norm": 0.27458996393587615, "learning_rate": 7.860089606529744e-05, "loss": 0.0637, "step": 7330 }, { "epoch": 0.5860513393748253, "grad_norm": 0.8209883237591991, "learning_rate": 7.855653639710775e-05, "loss": 0.0745, "step": 7340 }, { "epoch": 0.5868497744420935, "grad_norm": 0.9682766674817845, "learning_rate": 7.851217672891808e-05, "loss": 0.0688, "step": 7350 }, { "epoch": 0.5876482095093617, "grad_norm": 0.2958448984803725, "learning_rate": 7.84678170607284e-05, "loss": 0.0908, "step": 7360 }, { "epoch": 0.5884466445766298, "grad_norm": 0.935616415881285, "learning_rate": 7.842345739253871e-05, "loss": 0.0432, "step": 7370 }, { "epoch": 0.5892450796438979, "grad_norm": 0.42683926949066886, "learning_rate": 7.837909772434903e-05, "loss": 0.0724, "step": 7380 }, { "epoch": 0.5900435147111661, "grad_norm": 0.14183843181607633, "learning_rate": 7.833473805615935e-05, "loss": 0.0511, "step": 7390 }, { "epoch": 0.5908419497784343, "grad_norm": 0.5699310686505371, "learning_rate": 7.829037838796966e-05, "loss": 0.0871, "step": 7400 }, { "epoch": 0.5916403848457025, "grad_norm": 0.23883922920971742, "learning_rate": 7.824601871977998e-05, "loss": 0.0579, "step": 7410 }, { "epoch": 0.5924388199129705, "grad_norm": 0.6123424382050415, "learning_rate": 7.82016590515903e-05, "loss": 0.0864, "step": 7420 }, { "epoch": 0.5932372549802387, "grad_norm": 0.39849019500035515, "learning_rate": 7.815729938340062e-05, "loss": 0.0506, "step": 7430 }, { "epoch": 0.5940356900475069, "grad_norm": 0.4663347129336559, "learning_rate": 7.811293971521093e-05, "loss": 0.0574, "step": 7440 }, { "epoch": 0.5948341251147751, "grad_norm": 0.7471883648747045, "learning_rate": 7.806858004702125e-05, "loss": 0.0522, "step": 7450 }, { "epoch": 0.5956325601820432, "grad_norm": 0.651546517935869, "learning_rate": 7.802422037883157e-05, "loss": 0.062, "step": 7460 }, { "epoch": 0.5964309952493113, "grad_norm": 0.2860666478175177, "learning_rate": 7.797986071064189e-05, "loss": 0.0718, "step": 7470 }, { "epoch": 0.5972294303165795, "grad_norm": 0.6037593345648162, "learning_rate": 7.79355010424522e-05, "loss": 0.0519, "step": 7480 }, { "epoch": 0.5980278653838477, "grad_norm": 0.3900292909701525, "learning_rate": 7.789114137426252e-05, "loss": 0.0514, "step": 7490 }, { "epoch": 0.5988263004511158, "grad_norm": 0.2854211108781873, "learning_rate": 7.784678170607285e-05, "loss": 0.0544, "step": 7500 }, { "epoch": 0.599624735518384, "grad_norm": 0.8380500971235885, "learning_rate": 7.780242203788315e-05, "loss": 0.0511, "step": 7510 }, { "epoch": 0.6004231705856521, "grad_norm": 0.7573018775305534, "learning_rate": 7.775806236969347e-05, "loss": 0.0876, "step": 7520 }, { "epoch": 0.6012216056529203, "grad_norm": 0.26249122345970394, "learning_rate": 7.77137027015038e-05, "loss": 0.0785, "step": 7530 }, { "epoch": 0.6020200407201884, "grad_norm": 0.8173089098567144, "learning_rate": 7.766934303331411e-05, "loss": 0.0628, "step": 7540 }, { "epoch": 0.6028184757874566, "grad_norm": 0.4551514955773799, "learning_rate": 7.762498336512444e-05, "loss": 0.0564, "step": 7550 }, { "epoch": 0.6036169108547247, "grad_norm": 0.7281675170313439, "learning_rate": 7.758062369693476e-05, "loss": 0.0558, "step": 7560 }, { "epoch": 0.6044153459219929, "grad_norm": 0.729468984681738, "learning_rate": 7.753626402874506e-05, "loss": 0.0673, "step": 7570 }, { "epoch": 0.605213780989261, "grad_norm": 1.175985724771846, "learning_rate": 7.749190436055539e-05, "loss": 0.0915, "step": 7580 }, { "epoch": 0.6060122160565292, "grad_norm": 0.2252551084028221, "learning_rate": 7.744754469236571e-05, "loss": 0.059, "step": 7590 }, { "epoch": 0.6068106511237974, "grad_norm": 0.6300017871560232, "learning_rate": 7.740318502417602e-05, "loss": 0.0605, "step": 7600 }, { "epoch": 0.6076090861910655, "grad_norm": 0.4962012860136617, "learning_rate": 7.735882535598634e-05, "loss": 0.0664, "step": 7610 }, { "epoch": 0.6084075212583336, "grad_norm": 0.36523306801124467, "learning_rate": 7.731446568779666e-05, "loss": 0.0507, "step": 7620 }, { "epoch": 0.6092059563256018, "grad_norm": 0.7522576930476144, "learning_rate": 7.727010601960698e-05, "loss": 0.0506, "step": 7630 }, { "epoch": 0.61000439139287, "grad_norm": 0.25074975014272083, "learning_rate": 7.72257463514173e-05, "loss": 0.0395, "step": 7640 }, { "epoch": 0.6108028264601382, "grad_norm": 0.5220421408073481, "learning_rate": 7.718138668322761e-05, "loss": 0.0738, "step": 7650 }, { "epoch": 0.6116012615274062, "grad_norm": 0.44417669645567104, "learning_rate": 7.713702701503793e-05, "loss": 0.0601, "step": 7660 }, { "epoch": 0.6123996965946744, "grad_norm": 0.8061882748059278, "learning_rate": 7.709266734684825e-05, "loss": 0.0491, "step": 7670 }, { "epoch": 0.6131981316619426, "grad_norm": 2.5447832388331877, "learning_rate": 7.704830767865858e-05, "loss": 0.0565, "step": 7680 }, { "epoch": 0.6139965667292108, "grad_norm": 0.3301717640321505, "learning_rate": 7.700394801046888e-05, "loss": 0.0553, "step": 7690 }, { "epoch": 0.614795001796479, "grad_norm": 0.49796370625063435, "learning_rate": 7.69595883422792e-05, "loss": 0.0561, "step": 7700 }, { "epoch": 0.615593436863747, "grad_norm": 1.755807427152239, "learning_rate": 7.691522867408953e-05, "loss": 0.0687, "step": 7710 }, { "epoch": 0.6163918719310152, "grad_norm": 1.160415695120886, "learning_rate": 7.687086900589983e-05, "loss": 0.0556, "step": 7720 }, { "epoch": 0.6171903069982834, "grad_norm": 1.0085260825243685, "learning_rate": 7.682650933771016e-05, "loss": 0.069, "step": 7730 }, { "epoch": 0.6179887420655515, "grad_norm": 0.1306292147862706, "learning_rate": 7.678214966952048e-05, "loss": 0.055, "step": 7740 }, { "epoch": 0.6187871771328197, "grad_norm": 0.6485284615482432, "learning_rate": 7.673779000133079e-05, "loss": 0.0442, "step": 7750 }, { "epoch": 0.6195856122000878, "grad_norm": 0.4228073123981237, "learning_rate": 7.669343033314112e-05, "loss": 0.0764, "step": 7760 }, { "epoch": 0.620384047267356, "grad_norm": 0.23326275111725533, "learning_rate": 7.664907066495143e-05, "loss": 0.0464, "step": 7770 }, { "epoch": 0.6211824823346241, "grad_norm": 0.9561233426879092, "learning_rate": 7.660471099676175e-05, "loss": 0.0638, "step": 7780 }, { "epoch": 0.6219809174018923, "grad_norm": 0.6384667660627328, "learning_rate": 7.656035132857207e-05, "loss": 0.07, "step": 7790 }, { "epoch": 0.6227793524691605, "grad_norm": 0.8416825374497995, "learning_rate": 7.651599166038239e-05, "loss": 0.077, "step": 7800 }, { "epoch": 0.6235777875364286, "grad_norm": 1.0456515597273508, "learning_rate": 7.64716319921927e-05, "loss": 0.0667, "step": 7810 }, { "epoch": 0.6243762226036967, "grad_norm": 0.43237442544902527, "learning_rate": 7.642727232400302e-05, "loss": 0.0422, "step": 7820 }, { "epoch": 0.6251746576709649, "grad_norm": 0.5474360097858476, "learning_rate": 7.638291265581334e-05, "loss": 0.0567, "step": 7830 }, { "epoch": 0.6259730927382331, "grad_norm": 0.21149515091500953, "learning_rate": 7.633855298762366e-05, "loss": 0.0647, "step": 7840 }, { "epoch": 0.6267715278055013, "grad_norm": 0.381989832210318, "learning_rate": 7.629419331943397e-05, "loss": 0.0513, "step": 7850 }, { "epoch": 0.6275699628727693, "grad_norm": 0.5773680693611083, "learning_rate": 7.624983365124429e-05, "loss": 0.0638, "step": 7860 }, { "epoch": 0.6283683979400375, "grad_norm": 0.27172771417711034, "learning_rate": 7.620547398305461e-05, "loss": 0.0463, "step": 7870 }, { "epoch": 0.6291668330073057, "grad_norm": 1.2529735246413634, "learning_rate": 7.616111431486492e-05, "loss": 0.0711, "step": 7880 }, { "epoch": 0.6299652680745739, "grad_norm": 0.520377783745768, "learning_rate": 7.611675464667524e-05, "loss": 0.0638, "step": 7890 }, { "epoch": 0.6307637031418419, "grad_norm": 1.748282147554803, "learning_rate": 7.607239497848556e-05, "loss": 0.0604, "step": 7900 }, { "epoch": 0.6315621382091101, "grad_norm": 0.26800510354228496, "learning_rate": 7.602803531029589e-05, "loss": 0.0437, "step": 7910 }, { "epoch": 0.6323605732763783, "grad_norm": 1.4528358850229193, "learning_rate": 7.59836756421062e-05, "loss": 0.0592, "step": 7920 }, { "epoch": 0.6331590083436465, "grad_norm": 0.1173707058797108, "learning_rate": 7.593931597391651e-05, "loss": 0.0449, "step": 7930 }, { "epoch": 0.6339574434109146, "grad_norm": 0.218979297067727, "learning_rate": 7.589495630572684e-05, "loss": 0.0649, "step": 7940 }, { "epoch": 0.6347558784781827, "grad_norm": 0.18828056943739424, "learning_rate": 7.585059663753715e-05, "loss": 0.0496, "step": 7950 }, { "epoch": 0.6355543135454509, "grad_norm": 0.36105964997831, "learning_rate": 7.580623696934748e-05, "loss": 0.0779, "step": 7960 }, { "epoch": 0.6363527486127191, "grad_norm": 0.3860519463421298, "learning_rate": 7.57618773011578e-05, "loss": 0.0616, "step": 7970 }, { "epoch": 0.6371511836799872, "grad_norm": 0.37066091618679997, "learning_rate": 7.57175176329681e-05, "loss": 0.0694, "step": 7980 }, { "epoch": 0.6379496187472554, "grad_norm": 0.8566487456631784, "learning_rate": 7.567315796477843e-05, "loss": 0.0749, "step": 7990 }, { "epoch": 0.6387480538145235, "grad_norm": 0.4122089947136881, "learning_rate": 7.562879829658875e-05, "loss": 0.0558, "step": 8000 }, { "epoch": 0.6395464888817917, "grad_norm": 1.1227851605078236, "learning_rate": 7.558443862839906e-05, "loss": 0.0581, "step": 8010 }, { "epoch": 0.6403449239490598, "grad_norm": 0.4289686099198654, "learning_rate": 7.554007896020938e-05, "loss": 0.0576, "step": 8020 }, { "epoch": 0.641143359016328, "grad_norm": 0.6918225500592088, "learning_rate": 7.54957192920197e-05, "loss": 0.0783, "step": 8030 }, { "epoch": 0.6419417940835962, "grad_norm": 0.7921222085446677, "learning_rate": 7.545135962383002e-05, "loss": 0.0659, "step": 8040 }, { "epoch": 0.6427402291508643, "grad_norm": 0.555757166135947, "learning_rate": 7.540699995564033e-05, "loss": 0.0552, "step": 8050 }, { "epoch": 0.6435386642181324, "grad_norm": 0.6164912342935124, "learning_rate": 7.536264028745066e-05, "loss": 0.0488, "step": 8060 }, { "epoch": 0.6443370992854006, "grad_norm": 1.2094497855234292, "learning_rate": 7.531828061926097e-05, "loss": 0.0727, "step": 8070 }, { "epoch": 0.6451355343526688, "grad_norm": 0.44264474930118525, "learning_rate": 7.527392095107129e-05, "loss": 0.0725, "step": 8080 }, { "epoch": 0.645933969419937, "grad_norm": 0.8852677852993542, "learning_rate": 7.522956128288162e-05, "loss": 0.0706, "step": 8090 }, { "epoch": 0.646732404487205, "grad_norm": 0.4185552606938776, "learning_rate": 7.518520161469192e-05, "loss": 0.0615, "step": 8100 }, { "epoch": 0.6475308395544732, "grad_norm": 0.9889620142615653, "learning_rate": 7.514084194650224e-05, "loss": 0.066, "step": 8110 }, { "epoch": 0.6483292746217414, "grad_norm": 0.5444520967465741, "learning_rate": 7.509648227831257e-05, "loss": 0.0726, "step": 8120 }, { "epoch": 0.6491277096890096, "grad_norm": 0.6543439122201649, "learning_rate": 7.505212261012287e-05, "loss": 0.0709, "step": 8130 }, { "epoch": 0.6499261447562777, "grad_norm": 0.2836388415205917, "learning_rate": 7.50077629419332e-05, "loss": 0.0552, "step": 8140 }, { "epoch": 0.6507245798235458, "grad_norm": 0.6375614613426243, "learning_rate": 7.496340327374352e-05, "loss": 0.0694, "step": 8150 }, { "epoch": 0.651523014890814, "grad_norm": 1.1844406778857912, "learning_rate": 7.491904360555383e-05, "loss": 0.0612, "step": 8160 }, { "epoch": 0.6523214499580822, "grad_norm": 0.49923423023373387, "learning_rate": 7.487468393736416e-05, "loss": 0.0611, "step": 8170 }, { "epoch": 0.6531198850253503, "grad_norm": 0.28409047103533674, "learning_rate": 7.483032426917447e-05, "loss": 0.051, "step": 8180 }, { "epoch": 0.6539183200926185, "grad_norm": 0.2772068614340186, "learning_rate": 7.478596460098479e-05, "loss": 0.066, "step": 8190 }, { "epoch": 0.6547167551598866, "grad_norm": 0.15989122587835364, "learning_rate": 7.474160493279511e-05, "loss": 0.0413, "step": 8200 } ], "logging_steps": 10, "max_steps": 25048, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }