{ "best_metric": 4.381021022796631, "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/mix-bpe/checkpoint-3906", "epoch": 0.9998557456507313, "eval_steps": 434, "global_step": 4332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002308069588298087, "grad_norm": 1.293995976448059, "learning_rate": 1.4999999999999999e-05, "loss": 8.3864, "step": 10 }, { "epoch": 0.004616139176596174, "grad_norm": 1.6057629585266113, "learning_rate": 2.9999999999999997e-05, "loss": 8.3534, "step": 20 }, { "epoch": 0.006924208764894262, "grad_norm": 1.8857852220535278, "learning_rate": 4.4999999999999996e-05, "loss": 8.2432, "step": 30 }, { "epoch": 0.009232278353192349, "grad_norm": 1.6119285821914673, "learning_rate": 5.9999999999999995e-05, "loss": 8.0835, "step": 40 }, { "epoch": 0.011540347941490435, "grad_norm": 1.474076747894287, "learning_rate": 7.5e-05, "loss": 7.8521, "step": 50 }, { "epoch": 0.013848417529788524, "grad_norm": 1.3890337944030762, "learning_rate": 8.999999999999999e-05, "loss": 7.6704, "step": 60 }, { "epoch": 0.01615648711808661, "grad_norm": 1.7293298244476318, "learning_rate": 0.00010499999999999999, "loss": 7.4797, "step": 70 }, { "epoch": 0.018464556706384697, "grad_norm": 1.161170244216919, "learning_rate": 0.00011999999999999999, "loss": 7.2628, "step": 80 }, { "epoch": 0.020772626294682784, "grad_norm": 1.2024428844451904, "learning_rate": 0.000135, "loss": 7.0311, "step": 90 }, { "epoch": 0.02308069588298087, "grad_norm": 0.9382893443107605, "learning_rate": 0.00015, "loss": 6.79, "step": 100 }, { "epoch": 0.025388765471278957, "grad_norm": 0.9623205661773682, "learning_rate": 0.000165, "loss": 6.6329, "step": 110 }, { "epoch": 0.027696835059577048, "grad_norm": 0.5374640822410583, "learning_rate": 0.00017999999999999998, "loss": 6.4776, "step": 120 }, { "epoch": 0.030004904647875134, "grad_norm": 0.37330469489097595, "learning_rate": 0.000195, "loss": 6.4591, "step": 130 }, { "epoch": 0.03231297423617322, "grad_norm": 0.4222196340560913, "learning_rate": 0.00020999999999999998, "loss": 6.4463, "step": 140 }, { "epoch": 0.03462104382447131, "grad_norm": 0.4244931638240814, "learning_rate": 0.000225, "loss": 6.4109, "step": 150 }, { "epoch": 0.036929113412769395, "grad_norm": 0.5667627453804016, "learning_rate": 0.00023999999999999998, "loss": 6.3945, "step": 160 }, { "epoch": 0.03923718300106748, "grad_norm": 0.4752316474914551, "learning_rate": 0.00025499999999999996, "loss": 6.3878, "step": 170 }, { "epoch": 0.04154525258936557, "grad_norm": 0.5644646883010864, "learning_rate": 0.00027, "loss": 6.3093, "step": 180 }, { "epoch": 0.043853322177663655, "grad_norm": 0.6428855657577515, "learning_rate": 0.000285, "loss": 6.3548, "step": 190 }, { "epoch": 0.04616139176596174, "grad_norm": 0.8332350850105286, "learning_rate": 0.0003, "loss": 6.2497, "step": 200 }, { "epoch": 0.04846946135425983, "grad_norm": 0.8160709142684937, "learning_rate": 0.0002999956645089803, "loss": 6.1748, "step": 210 }, { "epoch": 0.050777530942557915, "grad_norm": 0.7665246725082397, "learning_rate": 0.000299982658286541, "loss": 6.1079, "step": 220 }, { "epoch": 0.05308560053085601, "grad_norm": 0.7429030537605286, "learning_rate": 0.00029996098208452687, "loss": 6.032, "step": 230 }, { "epoch": 0.055393670119154095, "grad_norm": 0.9278781414031982, "learning_rate": 0.0002999306371559644, "loss": 6.023, "step": 240 }, { "epoch": 0.05770173970745218, "grad_norm": 0.7202680706977844, "learning_rate": 0.00029989162525498905, "loss": 5.9386, "step": 250 }, { "epoch": 0.06000980929575027, "grad_norm": 0.7615482807159424, "learning_rate": 0.000299843948636744, "loss": 5.8888, "step": 260 }, { "epoch": 0.062317878884048356, "grad_norm": 0.7727493643760681, "learning_rate": 0.00029978761005725014, "loss": 5.8483, "step": 270 }, { "epoch": 0.06462594847234644, "grad_norm": 0.8060325980186462, "learning_rate": 0.0002997226127732461, "loss": 5.8168, "step": 280 }, { "epoch": 0.06693401806064453, "grad_norm": 0.801364541053772, "learning_rate": 0.0002996489605420004, "loss": 5.7915, "step": 290 }, { "epoch": 0.06924208764894262, "grad_norm": 0.6761994957923889, "learning_rate": 0.0002995666576210942, "loss": 5.816, "step": 300 }, { "epoch": 0.0715501572372407, "grad_norm": 0.854761004447937, "learning_rate": 0.0002994757087681753, "loss": 5.7237, "step": 310 }, { "epoch": 0.07385822682553879, "grad_norm": 0.810724675655365, "learning_rate": 0.0002993761192406826, "loss": 5.6849, "step": 320 }, { "epoch": 0.07616629641383688, "grad_norm": 0.7817623615264893, "learning_rate": 0.000299267894795543, "loss": 5.6347, "step": 330 }, { "epoch": 0.07847436600213496, "grad_norm": 0.7851743698120117, "learning_rate": 0.0002991510416888378, "loss": 5.5995, "step": 340 }, { "epoch": 0.08078243559043305, "grad_norm": 0.736893355846405, "learning_rate": 0.0002990255666754418, "loss": 5.6445, "step": 350 }, { "epoch": 0.08309050517873114, "grad_norm": 0.8936936855316162, "learning_rate": 0.00029889147700863205, "loss": 5.6018, "step": 360 }, { "epoch": 0.08539857476702922, "grad_norm": 0.852159857749939, "learning_rate": 0.00029874878043966926, "loss": 5.5471, "step": 370 }, { "epoch": 0.08770664435532731, "grad_norm": 0.7617043256759644, "learning_rate": 0.0002985974852173493, "loss": 5.5397, "step": 380 }, { "epoch": 0.0900147139436254, "grad_norm": 0.7751661539077759, "learning_rate": 0.0002984376000875267, "loss": 5.4445, "step": 390 }, { "epoch": 0.09232278353192348, "grad_norm": 0.8484746813774109, "learning_rate": 0.00029826913429260843, "loss": 5.4171, "step": 400 }, { "epoch": 0.09463085312022157, "grad_norm": 0.885892391204834, "learning_rate": 0.0002980920975710206, "loss": 5.4414, "step": 410 }, { "epoch": 0.09693892270851966, "grad_norm": 0.8213350176811218, "learning_rate": 0.0002979065001566447, "loss": 5.4237, "step": 420 }, { "epoch": 0.09924699229681774, "grad_norm": 0.839364767074585, "learning_rate": 0.00029771235277822633, "loss": 5.4203, "step": 430 }, { "epoch": 0.10017022013213699, "eval_loss": 5.607814311981201, "eval_runtime": 38.9963, "eval_samples_per_second": 641.087, "eval_steps_per_second": 80.136, "step": 434 }, { "epoch": 0.10155506188511583, "grad_norm": 0.8963534832000732, "learning_rate": 0.0002975096666587551, "loss": 5.3826, "step": 440 }, { "epoch": 0.10386313147341392, "grad_norm": 0.9185658097267151, "learning_rate": 0.0002972984535148157, "loss": 5.3868, "step": 450 }, { "epoch": 0.10617120106171202, "grad_norm": 0.9081022143363953, "learning_rate": 0.0002970787255559106, "loss": 5.4027, "step": 460 }, { "epoch": 0.1084792706500101, "grad_norm": 0.8252591490745544, "learning_rate": 0.00029685049548375426, "loss": 5.3417, "step": 470 }, { "epoch": 0.11078734023830819, "grad_norm": 0.8946505784988403, "learning_rate": 0.0002966137764915393, "loss": 5.2916, "step": 480 }, { "epoch": 0.11309540982660628, "grad_norm": 0.8276567459106445, "learning_rate": 0.00029636858226317304, "loss": 5.2734, "step": 490 }, { "epoch": 0.11540347941490436, "grad_norm": 0.8609122037887573, "learning_rate": 0.00029611492697248726, "loss": 5.3293, "step": 500 }, { "epoch": 0.11771154900320245, "grad_norm": 0.9473890662193298, "learning_rate": 0.0002958528252824184, "loss": 5.3, "step": 510 }, { "epoch": 0.12001961859150054, "grad_norm": 0.8542040586471558, "learning_rate": 0.0002955822923441601, "loss": 5.2721, "step": 520 }, { "epoch": 0.12232768817979862, "grad_norm": 0.8030778169631958, "learning_rate": 0.00029530334379628735, "loss": 5.2909, "step": 530 }, { "epoch": 0.12463575776809671, "grad_norm": 0.8569799065589905, "learning_rate": 0.0002950159957638525, "loss": 5.2491, "step": 540 }, { "epoch": 0.12694382735639478, "grad_norm": 0.8883563280105591, "learning_rate": 0.00029472026485745297, "loss": 5.1771, "step": 550 }, { "epoch": 0.12925189694469288, "grad_norm": 0.9277822971343994, "learning_rate": 0.00029441616817227145, "loss": 5.1982, "step": 560 }, { "epoch": 0.13155996653299096, "grad_norm": 0.888548731803894, "learning_rate": 0.0002941037232870871, "loss": 5.2259, "step": 570 }, { "epoch": 0.13386803612128906, "grad_norm": 0.8762586712837219, "learning_rate": 0.00029378294826325993, "loss": 5.2699, "step": 580 }, { "epoch": 0.13617610570958713, "grad_norm": 1.1397993564605713, "learning_rate": 0.0002934538616436863, "loss": 5.2092, "step": 590 }, { "epoch": 0.13848417529788523, "grad_norm": 0.8557673096656799, "learning_rate": 0.0002931164824517275, "loss": 5.2417, "step": 600 }, { "epoch": 0.14079224488618333, "grad_norm": 0.8277891874313354, "learning_rate": 0.00029277083019010945, "loss": 5.16, "step": 610 }, { "epoch": 0.1431003144744814, "grad_norm": 0.8815522193908691, "learning_rate": 0.00029241692483979593, "loss": 5.1709, "step": 620 }, { "epoch": 0.1454083840627795, "grad_norm": 0.9206939339637756, "learning_rate": 0.0002920547868588331, "loss": 5.1484, "step": 630 }, { "epoch": 0.14771645365107758, "grad_norm": 0.8649567365646362, "learning_rate": 0.00029168443718116725, "loss": 5.1292, "step": 640 }, { "epoch": 0.15002452323937568, "grad_norm": 0.8811323642730713, "learning_rate": 0.00029130589721543433, "loss": 5.1518, "step": 650 }, { "epoch": 0.15233259282767375, "grad_norm": 0.8579047918319702, "learning_rate": 0.0002909191888437227, "loss": 5.0781, "step": 660 }, { "epoch": 0.15464066241597185, "grad_norm": 0.9432843923568726, "learning_rate": 0.00029052433442030797, "loss": 5.1191, "step": 670 }, { "epoch": 0.15694873200426993, "grad_norm": 0.8163111805915833, "learning_rate": 0.00029012135677036077, "loss": 5.0546, "step": 680 }, { "epoch": 0.15925680159256803, "grad_norm": 0.8901731967926025, "learning_rate": 0.00028971027918862777, "loss": 5.0731, "step": 690 }, { "epoch": 0.1615648711808661, "grad_norm": 0.9626258015632629, "learning_rate": 0.00028929112543808435, "loss": 5.0357, "step": 700 }, { "epoch": 0.1638729407691642, "grad_norm": 0.898350179195404, "learning_rate": 0.0002888639197485614, "loss": 5.1027, "step": 710 }, { "epoch": 0.16618101035746227, "grad_norm": 0.9431145787239075, "learning_rate": 0.00028842868681534486, "loss": 4.9996, "step": 720 }, { "epoch": 0.16848907994576037, "grad_norm": 0.8631531596183777, "learning_rate": 0.0002879854517977475, "loss": 5.0532, "step": 730 }, { "epoch": 0.17079714953405845, "grad_norm": 0.9095075726509094, "learning_rate": 0.0002875342403176553, "loss": 5.0117, "step": 740 }, { "epoch": 0.17310521912235655, "grad_norm": 0.8849276304244995, "learning_rate": 0.00028707507845804575, "loss": 4.997, "step": 750 }, { "epoch": 0.17541328871065462, "grad_norm": 1.015526533126831, "learning_rate": 0.00028660799276148053, "loss": 5.017, "step": 760 }, { "epoch": 0.17772135829895272, "grad_norm": 0.9242818355560303, "learning_rate": 0.00028613301022857086, "loss": 5.0325, "step": 770 }, { "epoch": 0.1800294278872508, "grad_norm": 0.871295154094696, "learning_rate": 0.0002856501583164168, "loss": 5.0235, "step": 780 }, { "epoch": 0.1823374974755489, "grad_norm": 0.9293336868286133, "learning_rate": 0.0002851594649370201, "loss": 5.0311, "step": 790 }, { "epoch": 0.18464556706384697, "grad_norm": 0.8775876760482788, "learning_rate": 0.00028466095845567057, "loss": 5.0069, "step": 800 }, { "epoch": 0.18695363665214507, "grad_norm": 0.9968228340148926, "learning_rate": 0.0002841546676893065, "loss": 4.9595, "step": 810 }, { "epoch": 0.18926170624044314, "grad_norm": 0.9142509698867798, "learning_rate": 0.000283640621904849, "loss": 4.9601, "step": 820 }, { "epoch": 0.19156977582874124, "grad_norm": 0.863057017326355, "learning_rate": 0.0002831188508175096, "loss": 4.9811, "step": 830 }, { "epoch": 0.1938778454170393, "grad_norm": 0.8651947379112244, "learning_rate": 0.00028258938458907334, "loss": 4.9288, "step": 840 }, { "epoch": 0.1961859150053374, "grad_norm": 0.8541522026062012, "learning_rate": 0.0002820522538261545, "loss": 4.9037, "step": 850 }, { "epoch": 0.1984939845936355, "grad_norm": 0.9082189202308655, "learning_rate": 0.0002815074895784278, "loss": 4.8757, "step": 860 }, { "epoch": 0.20034044026427397, "eval_loss": 5.134793758392334, "eval_runtime": 39.0568, "eval_samples_per_second": 640.093, "eval_steps_per_second": 80.012, "step": 868 }, { "epoch": 0.2008020541819336, "grad_norm": 0.8799365162849426, "learning_rate": 0.0002809551233368332, "loss": 4.942, "step": 870 }, { "epoch": 0.20311012377023166, "grad_norm": 0.9927550554275513, "learning_rate": 0.00028039518703175577, "loss": 4.8919, "step": 880 }, { "epoch": 0.20541819335852976, "grad_norm": 0.9068810343742371, "learning_rate": 0.00027982771303117996, "loss": 4.9315, "step": 890 }, { "epoch": 0.20772626294682783, "grad_norm": 0.975292980670929, "learning_rate": 0.000279252734138818, "loss": 4.919, "step": 900 }, { "epoch": 0.21003433253512593, "grad_norm": 0.8984159231185913, "learning_rate": 0.0002786702835922144, "loss": 4.8989, "step": 910 }, { "epoch": 0.21234240212342403, "grad_norm": 0.9731834530830383, "learning_rate": 0.0002780803950608239, "loss": 4.8991, "step": 920 }, { "epoch": 0.2146504717117221, "grad_norm": 0.932558536529541, "learning_rate": 0.00027748310264406564, "loss": 4.8866, "step": 930 }, { "epoch": 0.2169585413000202, "grad_norm": 0.9369585514068604, "learning_rate": 0.00027687844086935176, "loss": 4.8829, "step": 940 }, { "epoch": 0.21926661088831828, "grad_norm": 0.9317886829376221, "learning_rate": 0.0002762664446900914, "loss": 4.8399, "step": 950 }, { "epoch": 0.22157468047661638, "grad_norm": 0.9692897796630859, "learning_rate": 0.00027564714948367046, "loss": 4.8339, "step": 960 }, { "epoch": 0.22388275006491445, "grad_norm": 0.912087082862854, "learning_rate": 0.0002750205910494064, "loss": 4.8176, "step": 970 }, { "epoch": 0.22619081965321255, "grad_norm": 0.9403560757637024, "learning_rate": 0.00027438680560647877, "loss": 4.8227, "step": 980 }, { "epoch": 0.22849888924151063, "grad_norm": 0.9240433573722839, "learning_rate": 0.0002737458297918355, "loss": 4.8053, "step": 990 }, { "epoch": 0.23080695882980873, "grad_norm": 0.9296719431877136, "learning_rate": 0.000273097700658075, "loss": 4.8571, "step": 1000 }, { "epoch": 0.2331150284181068, "grad_norm": 0.9312313199043274, "learning_rate": 0.0002724424556713046, "loss": 4.8177, "step": 1010 }, { "epoch": 0.2354230980064049, "grad_norm": 0.9134889245033264, "learning_rate": 0.0002717801327089743, "loss": 4.7824, "step": 1020 }, { "epoch": 0.23773116759470297, "grad_norm": 0.8830430507659912, "learning_rate": 0.0002711107700576875, "loss": 4.7996, "step": 1030 }, { "epoch": 0.24003923718300108, "grad_norm": 0.9481594562530518, "learning_rate": 0.00027043440641098777, "loss": 4.8118, "step": 1040 }, { "epoch": 0.24234730677129915, "grad_norm": 0.9204795956611633, "learning_rate": 0.0002697510808671219, "loss": 4.7847, "step": 1050 }, { "epoch": 0.24465537635959725, "grad_norm": 0.9537863731384277, "learning_rate": 0.0002690608329267801, "loss": 4.79, "step": 1060 }, { "epoch": 0.24696344594789532, "grad_norm": 0.9290711283683777, "learning_rate": 0.00026836370249081235, "loss": 4.7671, "step": 1070 }, { "epoch": 0.24927151553619342, "grad_norm": 0.9401620626449585, "learning_rate": 0.00026765972985792183, "loss": 4.7715, "step": 1080 }, { "epoch": 0.2515795851244915, "grad_norm": 0.9478754997253418, "learning_rate": 0.00026694895572233556, "loss": 4.8047, "step": 1090 }, { "epoch": 0.25388765471278957, "grad_norm": 0.9853310585021973, "learning_rate": 0.000266231421171452, "loss": 4.744, "step": 1100 }, { "epoch": 0.25619572430108767, "grad_norm": 0.9713465571403503, "learning_rate": 0.0002655071676834659, "loss": 4.7124, "step": 1110 }, { "epoch": 0.25850379388938577, "grad_norm": 0.9828191995620728, "learning_rate": 0.00026477623712497047, "loss": 4.7588, "step": 1120 }, { "epoch": 0.26081186347768387, "grad_norm": 0.9578918814659119, "learning_rate": 0.0002640386717485373, "loss": 4.7603, "step": 1130 }, { "epoch": 0.2631199330659819, "grad_norm": 0.9244778156280518, "learning_rate": 0.0002632945141902739, "loss": 4.7823, "step": 1140 }, { "epoch": 0.26542800265428, "grad_norm": 1.0172864198684692, "learning_rate": 0.00026254380746735926, "loss": 4.7744, "step": 1150 }, { "epoch": 0.2677360722425781, "grad_norm": 0.9416136741638184, "learning_rate": 0.00026178659497555663, "loss": 4.7059, "step": 1160 }, { "epoch": 0.2700441418308762, "grad_norm": 0.8836077451705933, "learning_rate": 0.0002610229204867055, "loss": 4.7969, "step": 1170 }, { "epoch": 0.27235221141917426, "grad_norm": 0.9215325713157654, "learning_rate": 0.000260252828146191, "loss": 4.7436, "step": 1180 }, { "epoch": 0.27466028100747236, "grad_norm": 0.9629151225090027, "learning_rate": 0.0002594763624703922, "loss": 4.7368, "step": 1190 }, { "epoch": 0.27696835059577046, "grad_norm": 1.0467829704284668, "learning_rate": 0.00025869356834410864, "loss": 4.6909, "step": 1200 }, { "epoch": 0.27927642018406856, "grad_norm": 0.9704943299293518, "learning_rate": 0.00025790449101796575, "loss": 4.6959, "step": 1210 }, { "epoch": 0.28158448977236666, "grad_norm": 0.9856391549110413, "learning_rate": 0.0002571091761057989, "loss": 4.6771, "step": 1220 }, { "epoch": 0.2838925593606647, "grad_norm": 1.040661096572876, "learning_rate": 0.00025630766958201695, "loss": 4.7199, "step": 1230 }, { "epoch": 0.2862006289489628, "grad_norm": 0.9736753702163696, "learning_rate": 0.0002555000177789444, "loss": 4.6799, "step": 1240 }, { "epoch": 0.2885086985372609, "grad_norm": 1.006039023399353, "learning_rate": 0.00025468626738414305, "loss": 4.694, "step": 1250 }, { "epoch": 0.290816768125559, "grad_norm": 0.96723473072052, "learning_rate": 0.0002538664654377134, "loss": 4.6445, "step": 1260 }, { "epoch": 0.29312483771385706, "grad_norm": 0.9628943800926208, "learning_rate": 0.00025304065932957494, "loss": 4.6465, "step": 1270 }, { "epoch": 0.29543290730215516, "grad_norm": 0.9398277997970581, "learning_rate": 0.00025220889679672745, "loss": 4.6037, "step": 1280 }, { "epoch": 0.29774097689045326, "grad_norm": 0.9146639108657837, "learning_rate": 0.00025137122592049066, "loss": 4.6605, "step": 1290 }, { "epoch": 0.30004904647875136, "grad_norm": 0.9692845940589905, "learning_rate": 0.0002505276951237254, "loss": 4.6331, "step": 1300 }, { "epoch": 0.30051066039641094, "eval_loss": 4.868031978607178, "eval_runtime": 39.4278, "eval_samples_per_second": 634.071, "eval_steps_per_second": 79.259, "step": 1302 }, { "epoch": 0.3023571160670494, "grad_norm": 0.9402655959129333, "learning_rate": 0.00024967835316803434, "loss": 4.6622, "step": 1310 }, { "epoch": 0.3046651856553475, "grad_norm": 0.9427255392074585, "learning_rate": 0.00024882324915094305, "loss": 4.6359, "step": 1320 }, { "epoch": 0.3069732552436456, "grad_norm": 0.9967660307884216, "learning_rate": 0.00024796243250306196, "loss": 4.6153, "step": 1330 }, { "epoch": 0.3092813248319437, "grad_norm": 0.9784890413284302, "learning_rate": 0.00024709595298522916, "loss": 4.6401, "step": 1340 }, { "epoch": 0.31158939442024175, "grad_norm": 1.011221170425415, "learning_rate": 0.00024622386068563344, "loss": 4.5711, "step": 1350 }, { "epoch": 0.31389746400853985, "grad_norm": 1.0162895917892456, "learning_rate": 0.0002453462060169193, "loss": 4.6102, "step": 1360 }, { "epoch": 0.31620553359683795, "grad_norm": 0.9559857845306396, "learning_rate": 0.00024446303971327254, "loss": 4.6215, "step": 1370 }, { "epoch": 0.31851360318513605, "grad_norm": 0.9640511870384216, "learning_rate": 0.00024357441282748756, "loss": 4.6299, "step": 1380 }, { "epoch": 0.3208216727734341, "grad_norm": 0.990332841873169, "learning_rate": 0.00024268037672801605, "loss": 4.6633, "step": 1390 }, { "epoch": 0.3231297423617322, "grad_norm": 0.971305787563324, "learning_rate": 0.00024178098309599782, "loss": 4.6453, "step": 1400 }, { "epoch": 0.3254378119500303, "grad_norm": 0.9675345420837402, "learning_rate": 0.00024087628392227304, "loss": 4.562, "step": 1410 }, { "epoch": 0.3277458815383284, "grad_norm": 0.9208952188491821, "learning_rate": 0.000239966331504377, "loss": 4.576, "step": 1420 }, { "epoch": 0.33005395112662644, "grad_norm": 1.004249930381775, "learning_rate": 0.00023905117844351674, "loss": 4.6263, "step": 1430 }, { "epoch": 0.33236202071492454, "grad_norm": 0.9654126763343811, "learning_rate": 0.0002381308776415307, "loss": 4.6431, "step": 1440 }, { "epoch": 0.33467009030322264, "grad_norm": 0.9294122457504272, "learning_rate": 0.0002372054822978304, "loss": 4.5542, "step": 1450 }, { "epoch": 0.33697815989152075, "grad_norm": 0.8988949060440063, "learning_rate": 0.00023627504590632517, "loss": 4.6312, "step": 1460 }, { "epoch": 0.3392862294798188, "grad_norm": 1.0338096618652344, "learning_rate": 0.00023533962225232992, "loss": 4.5963, "step": 1470 }, { "epoch": 0.3415942990681169, "grad_norm": 0.9440902471542358, "learning_rate": 0.00023439926540945604, "loss": 4.5587, "step": 1480 }, { "epoch": 0.343902368656415, "grad_norm": 1.0818684101104736, "learning_rate": 0.00023345402973648548, "loss": 4.5462, "step": 1490 }, { "epoch": 0.3462104382447131, "grad_norm": 0.9467376470565796, "learning_rate": 0.00023250396987422857, "loss": 4.5969, "step": 1500 }, { "epoch": 0.3485185078330112, "grad_norm": 1.009598970413208, "learning_rate": 0.00023154914074236522, "loss": 4.5773, "step": 1510 }, { "epoch": 0.35082657742130924, "grad_norm": 1.053017020225525, "learning_rate": 0.00023058959753627056, "loss": 4.6051, "step": 1520 }, { "epoch": 0.35313464700960734, "grad_norm": 0.969485342502594, "learning_rate": 0.0002296253957238239, "loss": 4.604, "step": 1530 }, { "epoch": 0.35544271659790544, "grad_norm": 1.2008713483810425, "learning_rate": 0.00022865659104220255, "loss": 4.5704, "step": 1540 }, { "epoch": 0.35775078618620354, "grad_norm": 1.0302430391311646, "learning_rate": 0.00022768323949465987, "loss": 4.5391, "step": 1550 }, { "epoch": 0.3600588557745016, "grad_norm": 0.966468870639801, "learning_rate": 0.0002267053973472877, "loss": 4.5363, "step": 1560 }, { "epoch": 0.3623669253627997, "grad_norm": 0.9762535095214844, "learning_rate": 0.00022572312112576406, "loss": 4.5587, "step": 1570 }, { "epoch": 0.3646749949510978, "grad_norm": 1.0306516885757446, "learning_rate": 0.0002247364676120855, "loss": 4.5296, "step": 1580 }, { "epoch": 0.3669830645393959, "grad_norm": 0.9838928580284119, "learning_rate": 0.00022374549384128456, "loss": 4.5947, "step": 1590 }, { "epoch": 0.36929113412769393, "grad_norm": 0.9782153964042664, "learning_rate": 0.0002227502570981331, "loss": 4.5091, "step": 1600 }, { "epoch": 0.37159920371599203, "grad_norm": 1.0444267988204956, "learning_rate": 0.00022175081491383048, "loss": 4.5221, "step": 1610 }, { "epoch": 0.37390727330429013, "grad_norm": 1.0300132036209106, "learning_rate": 0.00022074722506267846, "loss": 4.5312, "step": 1620 }, { "epoch": 0.37621534289258823, "grad_norm": 1.0193167924880981, "learning_rate": 0.00021973954555874067, "loss": 4.5706, "step": 1630 }, { "epoch": 0.3785234124808863, "grad_norm": 1.005703091621399, "learning_rate": 0.00021872783465248978, "loss": 4.5149, "step": 1640 }, { "epoch": 0.3808314820691844, "grad_norm": 1.0483267307281494, "learning_rate": 0.00021771215082743968, "loss": 4.4801, "step": 1650 }, { "epoch": 0.3831395516574825, "grad_norm": 0.9708661437034607, "learning_rate": 0.00021669255279676514, "loss": 4.5121, "step": 1660 }, { "epoch": 0.3854476212457806, "grad_norm": 1.0603435039520264, "learning_rate": 0.00021566909949990746, "loss": 4.4826, "step": 1670 }, { "epoch": 0.3877556908340786, "grad_norm": 1.1404428482055664, "learning_rate": 0.0002146418500991678, "loss": 4.4787, "step": 1680 }, { "epoch": 0.3900637604223767, "grad_norm": 1.0174285173416138, "learning_rate": 0.00021361086397628682, "loss": 4.5004, "step": 1690 }, { "epoch": 0.3923718300106748, "grad_norm": 1.1664884090423584, "learning_rate": 0.0002125762007290121, "loss": 4.5197, "step": 1700 }, { "epoch": 0.3946798995989729, "grad_norm": 0.9837216138839722, "learning_rate": 0.00021153792016765334, "loss": 4.5019, "step": 1710 }, { "epoch": 0.396987969187271, "grad_norm": 1.0171154737472534, "learning_rate": 0.00021049608231162454, "loss": 4.514, "step": 1720 }, { "epoch": 0.3992960387755691, "grad_norm": 1.0302062034606934, "learning_rate": 0.00020945074738597447, "loss": 4.5388, "step": 1730 }, { "epoch": 0.40068088052854794, "eval_loss": 4.705195426940918, "eval_runtime": 39.444, "eval_samples_per_second": 633.81, "eval_steps_per_second": 79.226, "step": 1736 }, { "epoch": 0.4016041083638672, "grad_norm": 1.0344492197036743, "learning_rate": 0.00020840197581790569, "loss": 4.454, "step": 1740 }, { "epoch": 0.4039121779521653, "grad_norm": 1.0174837112426758, "learning_rate": 0.00020734982823328104, "loss": 4.4651, "step": 1750 }, { "epoch": 0.4062202475404633, "grad_norm": 0.9471856355667114, "learning_rate": 0.00020629436545311928, "loss": 4.5174, "step": 1760 }, { "epoch": 0.4085283171287614, "grad_norm": 1.0585298538208008, "learning_rate": 0.00020523564849007906, "loss": 4.4544, "step": 1770 }, { "epoch": 0.4108363867170595, "grad_norm": 1.0396827459335327, "learning_rate": 0.00020417373854493228, "loss": 4.5077, "step": 1780 }, { "epoch": 0.4131444563053576, "grad_norm": 1.0221539735794067, "learning_rate": 0.0002031086970030259, "loss": 4.4515, "step": 1790 }, { "epoch": 0.41545252589365567, "grad_norm": 0.9722737669944763, "learning_rate": 0.00020204058543073393, "loss": 4.4483, "step": 1800 }, { "epoch": 0.41776059548195377, "grad_norm": 1.0386008024215698, "learning_rate": 0.00020096946557189802, "loss": 4.5063, "step": 1810 }, { "epoch": 0.42006866507025187, "grad_norm": 1.1091365814208984, "learning_rate": 0.00019989539934425857, "loss": 4.4913, "step": 1820 }, { "epoch": 0.42237673465854997, "grad_norm": 1.083609700202942, "learning_rate": 0.0001988184488358754, "loss": 4.4873, "step": 1830 }, { "epoch": 0.42468480424684807, "grad_norm": 1.084915280342102, "learning_rate": 0.00019773867630153857, "loss": 4.4625, "step": 1840 }, { "epoch": 0.4269928738351461, "grad_norm": 0.9942842125892639, "learning_rate": 0.00019665614415916979, "loss": 4.435, "step": 1850 }, { "epoch": 0.4293009434234442, "grad_norm": 1.0121150016784668, "learning_rate": 0.00019557091498621416, "loss": 4.4056, "step": 1860 }, { "epoch": 0.4316090130117423, "grad_norm": 1.0252583026885986, "learning_rate": 0.00019448305151602272, "loss": 4.3947, "step": 1870 }, { "epoch": 0.4339170826000404, "grad_norm": 1.09006667137146, "learning_rate": 0.00019339261663422629, "loss": 4.4671, "step": 1880 }, { "epoch": 0.43622515218833846, "grad_norm": 1.0912604331970215, "learning_rate": 0.00019229967337510003, "loss": 4.3903, "step": 1890 }, { "epoch": 0.43853322177663656, "grad_norm": 1.0326118469238281, "learning_rate": 0.00019120428491791974, "loss": 4.4382, "step": 1900 }, { "epoch": 0.44084129136493466, "grad_norm": 1.0649503469467163, "learning_rate": 0.00019010651458330964, "loss": 4.3955, "step": 1910 }, { "epoch": 0.44314936095323276, "grad_norm": 1.0922999382019043, "learning_rate": 0.00018900642582958213, "loss": 4.4406, "step": 1920 }, { "epoch": 0.4454574305415308, "grad_norm": 1.0487009286880493, "learning_rate": 0.0001879040822490693, "loss": 4.4296, "step": 1930 }, { "epoch": 0.4477655001298289, "grad_norm": 0.9958457946777344, "learning_rate": 0.00018679954756444723, "loss": 4.46, "step": 1940 }, { "epoch": 0.450073569718127, "grad_norm": 1.0039042234420776, "learning_rate": 0.00018569288562505183, "loss": 4.4473, "step": 1950 }, { "epoch": 0.4523816393064251, "grad_norm": 1.1141057014465332, "learning_rate": 0.00018458416040318857, "loss": 4.4023, "step": 1960 }, { "epoch": 0.45468970889472315, "grad_norm": 1.052263855934143, "learning_rate": 0.00018347343599043388, "loss": 4.4455, "step": 1970 }, { "epoch": 0.45699777848302126, "grad_norm": 0.942537784576416, "learning_rate": 0.00018236077659393077, "loss": 4.466, "step": 1980 }, { "epoch": 0.45930584807131936, "grad_norm": 1.0139743089675903, "learning_rate": 0.00018124624653267682, "loss": 4.4551, "step": 1990 }, { "epoch": 0.46161391765961746, "grad_norm": 1.0442676544189453, "learning_rate": 0.0001801299102338063, "loss": 4.4184, "step": 2000 }, { "epoch": 0.4639219872479155, "grad_norm": 1.1058298349380493, "learning_rate": 0.00017901183222886592, "loss": 4.4478, "step": 2010 }, { "epoch": 0.4662300568362136, "grad_norm": 1.1228169202804565, "learning_rate": 0.00017789207715008428, "loss": 4.3777, "step": 2020 }, { "epoch": 0.4685381264245117, "grad_norm": 1.022026777267456, "learning_rate": 0.0001767707097266359, "loss": 4.4119, "step": 2030 }, { "epoch": 0.4708461960128098, "grad_norm": 1.0868556499481201, "learning_rate": 0.0001756477947808994, "loss": 4.3989, "step": 2040 }, { "epoch": 0.47315426560110785, "grad_norm": 1.0714977979660034, "learning_rate": 0.00017452339722471026, "loss": 4.4166, "step": 2050 }, { "epoch": 0.47546233518940595, "grad_norm": 1.0716434717178345, "learning_rate": 0.0001733975820556086, "loss": 4.3757, "step": 2060 }, { "epoch": 0.47777040477770405, "grad_norm": 1.0657131671905518, "learning_rate": 0.00017227041435308177, "loss": 4.3756, "step": 2070 }, { "epoch": 0.48007847436600215, "grad_norm": 1.0671368837356567, "learning_rate": 0.00017114195927480256, "loss": 4.3956, "step": 2080 }, { "epoch": 0.4823865439543002, "grad_norm": 1.0677906274795532, "learning_rate": 0.00017001228205286236, "loss": 4.3989, "step": 2090 }, { "epoch": 0.4846946135425983, "grad_norm": 1.0088515281677246, "learning_rate": 0.00016888144799000047, "loss": 4.4024, "step": 2100 }, { "epoch": 0.4870026831308964, "grad_norm": 0.9815865755081177, "learning_rate": 0.0001677495224558293, "loss": 4.4096, "step": 2110 }, { "epoch": 0.4893107527191945, "grad_norm": 1.1238495111465454, "learning_rate": 0.00016661657088305526, "loss": 4.3879, "step": 2120 }, { "epoch": 0.4916188223074926, "grad_norm": 1.0731137990951538, "learning_rate": 0.0001654826587636967, "loss": 4.4087, "step": 2130 }, { "epoch": 0.49392689189579064, "grad_norm": 1.1006789207458496, "learning_rate": 0.0001643478516452977, "loss": 4.3862, "step": 2140 }, { "epoch": 0.49623496148408874, "grad_norm": 1.0866320133209229, "learning_rate": 0.00016321221512713928, "loss": 4.3835, "step": 2150 }, { "epoch": 0.49854303107238684, "grad_norm": 1.1201952695846558, "learning_rate": 0.00016207581485644707, "loss": 4.3263, "step": 2160 }, { "epoch": 0.500851100660685, "grad_norm": 1.0500922203063965, "learning_rate": 0.0001609387165245966, "loss": 4.3604, "step": 2170 }, { "epoch": 0.500851100660685, "eval_loss": 4.576458930969238, "eval_runtime": 39.2462, "eval_samples_per_second": 637.005, "eval_steps_per_second": 79.626, "step": 2170 }, { "epoch": 0.503159170248983, "grad_norm": 1.1081622838974, "learning_rate": 0.0001598009858633161, "loss": 4.3842, "step": 2180 }, { "epoch": 0.5054672398372811, "grad_norm": 1.0299861431121826, "learning_rate": 0.00015866268864088626, "loss": 4.435, "step": 2190 }, { "epoch": 0.5077753094255791, "grad_norm": 1.0571188926696777, "learning_rate": 0.00015752389065833898, "loss": 4.3528, "step": 2200 }, { "epoch": 0.5100833790138772, "grad_norm": 1.1348025798797607, "learning_rate": 0.0001563846577456533, "loss": 4.3108, "step": 2210 }, { "epoch": 0.5123914486021753, "grad_norm": 1.0860552787780762, "learning_rate": 0.00015524505575794997, "loss": 4.3618, "step": 2220 }, { "epoch": 0.5146995181904734, "grad_norm": 1.085950493812561, "learning_rate": 0.0001541051505716849, "loss": 4.3482, "step": 2230 }, { "epoch": 0.5170075877787715, "grad_norm": 1.0809770822525024, "learning_rate": 0.00015296500808084055, "loss": 4.3486, "step": 2240 }, { "epoch": 0.5193156573670696, "grad_norm": 1.1132404804229736, "learning_rate": 0.00015182469419311754, "loss": 4.311, "step": 2250 }, { "epoch": 0.5216237269553677, "grad_norm": 1.073387861251831, "learning_rate": 0.00015068427482612393, "loss": 4.3567, "step": 2260 }, { "epoch": 0.5239317965436658, "grad_norm": 1.0589022636413574, "learning_rate": 0.0001495438159035655, "loss": 4.3789, "step": 2270 }, { "epoch": 0.5262398661319638, "grad_norm": 1.082677960395813, "learning_rate": 0.00014840338335143452, "loss": 4.3612, "step": 2280 }, { "epoch": 0.5285479357202619, "grad_norm": 1.0973902940750122, "learning_rate": 0.0001472630430941987, "loss": 4.3735, "step": 2290 }, { "epoch": 0.53085600530856, "grad_norm": 1.1433314085006714, "learning_rate": 0.00014612286105099068, "loss": 4.3271, "step": 2300 }, { "epoch": 0.5331640748968581, "grad_norm": 1.0663039684295654, "learning_rate": 0.00014498290313179725, "loss": 4.3353, "step": 2310 }, { "epoch": 0.5354721444851562, "grad_norm": 1.1327035427093506, "learning_rate": 0.00014384323523364948, "loss": 4.3811, "step": 2320 }, { "epoch": 0.5377802140734543, "grad_norm": 1.102273941040039, "learning_rate": 0.00014270392323681303, "loss": 4.3814, "step": 2330 }, { "epoch": 0.5400882836617524, "grad_norm": 1.1237622499465942, "learning_rate": 0.00014156503300098038, "loss": 4.34, "step": 2340 }, { "epoch": 0.5423963532500505, "grad_norm": 1.1374157667160034, "learning_rate": 0.00014042663036146344, "loss": 4.3239, "step": 2350 }, { "epoch": 0.5447044228383485, "grad_norm": 1.171049952507019, "learning_rate": 0.0001392887811253878, "loss": 4.3335, "step": 2360 }, { "epoch": 0.5470124924266466, "grad_norm": 1.128770112991333, "learning_rate": 0.00013815155106788865, "loss": 4.3021, "step": 2370 }, { "epoch": 0.5493205620149447, "grad_norm": 1.1492712497711182, "learning_rate": 0.00013701500592830878, "loss": 4.3139, "step": 2380 }, { "epoch": 0.5516286316032428, "grad_norm": 1.1176902055740356, "learning_rate": 0.00013587921140639805, "loss": 4.3339, "step": 2390 }, { "epoch": 0.5539367011915409, "grad_norm": 1.106078863143921, "learning_rate": 0.00013474423315851586, "loss": 4.314, "step": 2400 }, { "epoch": 0.556244770779839, "grad_norm": 1.0579196214675903, "learning_rate": 0.00013361013679383553, "loss": 4.2973, "step": 2410 }, { "epoch": 0.5585528403681371, "grad_norm": 1.0957796573638916, "learning_rate": 0.0001324769878705518, "loss": 4.2925, "step": 2420 }, { "epoch": 0.5608609099564352, "grad_norm": 1.1303716897964478, "learning_rate": 0.000131344851892091, "loss": 4.338, "step": 2430 }, { "epoch": 0.5631689795447333, "grad_norm": 1.1496975421905518, "learning_rate": 0.0001302137943033249, "loss": 4.3075, "step": 2440 }, { "epoch": 0.5654770491330313, "grad_norm": 1.1256930828094482, "learning_rate": 0.00012908388048678686, "loss": 4.3234, "step": 2450 }, { "epoch": 0.5677851187213294, "grad_norm": 1.1006488800048828, "learning_rate": 0.00012795517575889303, "loss": 4.311, "step": 2460 }, { "epoch": 0.5700931883096275, "grad_norm": 1.223235845565796, "learning_rate": 0.00012682774536616623, "loss": 4.3056, "step": 2470 }, { "epoch": 0.5724012578979256, "grad_norm": 1.2225327491760254, "learning_rate": 0.00012570165448146447, "loss": 4.3276, "step": 2480 }, { "epoch": 0.5747093274862237, "grad_norm": 1.152992844581604, "learning_rate": 0.00012457696820021314, "loss": 4.3058, "step": 2490 }, { "epoch": 0.5770173970745218, "grad_norm": 1.252081036567688, "learning_rate": 0.00012345375153664264, "loss": 4.2789, "step": 2500 }, { "epoch": 0.5793254666628199, "grad_norm": 1.1459189653396606, "learning_rate": 0.0001223320694200297, "loss": 4.3181, "step": 2510 }, { "epoch": 0.581633536251118, "grad_norm": 1.0899156332015991, "learning_rate": 0.00012121198669094436, "loss": 4.3692, "step": 2520 }, { "epoch": 0.583941605839416, "grad_norm": 1.060966968536377, "learning_rate": 0.00012009356809750131, "loss": 4.3294, "step": 2530 }, { "epoch": 0.5862496754277141, "grad_norm": 1.1345113515853882, "learning_rate": 0.0001189768782916175, "loss": 4.3261, "step": 2540 }, { "epoch": 0.5885577450160122, "grad_norm": 1.1231807470321655, "learning_rate": 0.00011786198182527461, "loss": 4.3368, "step": 2550 }, { "epoch": 0.5908658146043103, "grad_norm": 1.0552847385406494, "learning_rate": 0.00011674894314678761, "loss": 4.2938, "step": 2560 }, { "epoch": 0.5931738841926084, "grad_norm": 1.1445595026016235, "learning_rate": 0.00011563782659707897, "loss": 4.3184, "step": 2570 }, { "epoch": 0.5954819537809065, "grad_norm": 1.1536098718643188, "learning_rate": 0.00011452869640595975, "loss": 4.3189, "step": 2580 }, { "epoch": 0.5977900233692046, "grad_norm": 1.2136541604995728, "learning_rate": 0.00011342161668841641, "loss": 4.2195, "step": 2590 }, { "epoch": 0.6000980929575027, "grad_norm": 1.1163119077682495, "learning_rate": 0.00011231665144090456, "loss": 4.2419, "step": 2600 }, { "epoch": 0.6010213207928219, "eval_loss": 4.489214897155762, "eval_runtime": 39.3697, "eval_samples_per_second": 635.006, "eval_steps_per_second": 79.376, "step": 2604 }, { "epoch": 0.6024061625458007, "grad_norm": 1.4087986946105957, "learning_rate": 0.0001112138645376496, "loss": 4.2601, "step": 2610 }, { "epoch": 0.6047142321340988, "grad_norm": 1.19622802734375, "learning_rate": 0.00011011331972695449, "loss": 4.296, "step": 2620 }, { "epoch": 0.6070223017223969, "grad_norm": 1.1068109273910522, "learning_rate": 0.00010901508062751438, "loss": 4.2879, "step": 2630 }, { "epoch": 0.609330371310695, "grad_norm": 1.156851887702942, "learning_rate": 0.00010791921072473941, "loss": 4.2653, "step": 2640 }, { "epoch": 0.6116384408989931, "grad_norm": 1.1451683044433594, "learning_rate": 0.00010682577336708449, "loss": 4.2987, "step": 2650 }, { "epoch": 0.6139465104872912, "grad_norm": 1.133888840675354, "learning_rate": 0.00010573483176238752, "loss": 4.2558, "step": 2660 }, { "epoch": 0.6162545800755893, "grad_norm": 1.1750303506851196, "learning_rate": 0.00010464644897421561, "loss": 4.3379, "step": 2670 }, { "epoch": 0.6185626496638874, "grad_norm": 1.1152632236480713, "learning_rate": 0.00010356068791821953, "loss": 4.2346, "step": 2680 }, { "epoch": 0.6208707192521854, "grad_norm": 1.167487382888794, "learning_rate": 0.0001024776113584966, "loss": 4.2805, "step": 2690 }, { "epoch": 0.6231787888404835, "grad_norm": 1.1430394649505615, "learning_rate": 0.00010139728190396288, "loss": 4.2433, "step": 2700 }, { "epoch": 0.6254868584287816, "grad_norm": 1.181851863861084, "learning_rate": 0.00010031976200473364, "loss": 4.2759, "step": 2710 }, { "epoch": 0.6277949280170797, "grad_norm": 1.152464509010315, "learning_rate": 9.92451139485136e-05, "loss": 4.2761, "step": 2720 }, { "epoch": 0.6301029976053778, "grad_norm": 1.1081931591033936, "learning_rate": 9.817339985699593e-05, "loss": 4.2457, "step": 2730 }, { "epoch": 0.6324110671936759, "grad_norm": 1.1744070053100586, "learning_rate": 9.710468168227158e-05, "loss": 4.2863, "step": 2740 }, { "epoch": 0.634719136781974, "grad_norm": 1.218344807624817, "learning_rate": 9.60390212032479e-05, "loss": 4.2711, "step": 2750 }, { "epoch": 0.6370272063702721, "grad_norm": 1.1617342233657837, "learning_rate": 9.497648002207745e-05, "loss": 4.2289, "step": 2760 }, { "epoch": 0.6393352759585702, "grad_norm": 1.1277827024459839, "learning_rate": 9.391711956059675e-05, "loss": 4.2894, "step": 2770 }, { "epoch": 0.6416433455468682, "grad_norm": 1.2645858526229858, "learning_rate": 9.286100105677608e-05, "loss": 4.2934, "step": 2780 }, { "epoch": 0.6439514151351663, "grad_norm": 1.1396396160125732, "learning_rate": 9.180818556117931e-05, "loss": 4.2627, "step": 2790 }, { "epoch": 0.6462594847234644, "grad_norm": 1.1112399101257324, "learning_rate": 9.075873393343487e-05, "loss": 4.2799, "step": 2800 }, { "epoch": 0.6485675543117625, "grad_norm": 1.1682491302490234, "learning_rate": 8.971270683871736e-05, "loss": 4.2557, "step": 2810 }, { "epoch": 0.6508756239000606, "grad_norm": 1.1381292343139648, "learning_rate": 8.867016474424121e-05, "loss": 4.2468, "step": 2820 }, { "epoch": 0.6531836934883587, "grad_norm": 1.2451766729354858, "learning_rate": 8.763116791576497e-05, "loss": 4.3402, "step": 2830 }, { "epoch": 0.6554917630766568, "grad_norm": 1.2092978954315186, "learning_rate": 8.659577641410756e-05, "loss": 4.2999, "step": 2840 }, { "epoch": 0.6577998326649549, "grad_norm": 1.1491693258285522, "learning_rate": 8.556405009167627e-05, "loss": 4.2427, "step": 2850 }, { "epoch": 0.6601079022532529, "grad_norm": 1.166319489479065, "learning_rate": 8.453604858900736e-05, "loss": 4.2599, "step": 2860 }, { "epoch": 0.662415971841551, "grad_norm": 1.1869049072265625, "learning_rate": 8.351183133131778e-05, "loss": 4.2849, "step": 2870 }, { "epoch": 0.6647240414298491, "grad_norm": 1.1337544918060303, "learning_rate": 8.24914575250707e-05, "loss": 4.2321, "step": 2880 }, { "epoch": 0.6670321110181472, "grad_norm": 1.1453369855880737, "learning_rate": 8.147498615455221e-05, "loss": 4.2508, "step": 2890 }, { "epoch": 0.6693401806064453, "grad_norm": 1.2037177085876465, "learning_rate": 8.046247597846244e-05, "loss": 4.2616, "step": 2900 }, { "epoch": 0.6716482501947434, "grad_norm": 1.2716485261917114, "learning_rate": 7.945398552651837e-05, "loss": 4.2711, "step": 2910 }, { "epoch": 0.6739563197830415, "grad_norm": 1.1850802898406982, "learning_rate": 7.844957309607061e-05, "loss": 4.254, "step": 2920 }, { "epoch": 0.6762643893713396, "grad_norm": 1.1652292013168335, "learning_rate": 7.744929674873344e-05, "loss": 4.2528, "step": 2930 }, { "epoch": 0.6785724589596376, "grad_norm": 1.1236425638198853, "learning_rate": 7.645321430702854e-05, "loss": 4.2309, "step": 2940 }, { "epoch": 0.6808805285479357, "grad_norm": 1.1567282676696777, "learning_rate": 7.546138335104229e-05, "loss": 4.2226, "step": 2950 }, { "epoch": 0.6831885981362338, "grad_norm": 1.2129831314086914, "learning_rate": 7.447386121509741e-05, "loss": 4.2682, "step": 2960 }, { "epoch": 0.6854966677245319, "grad_norm": 1.1564743518829346, "learning_rate": 7.349070498443857e-05, "loss": 4.2495, "step": 2970 }, { "epoch": 0.68780473731283, "grad_norm": 1.230202317237854, "learning_rate": 7.251197149193251e-05, "loss": 4.2339, "step": 2980 }, { "epoch": 0.6901128069011281, "grad_norm": 1.1715277433395386, "learning_rate": 7.153771731478289e-05, "loss": 4.2226, "step": 2990 }, { "epoch": 0.6924208764894262, "grad_norm": 1.2347677946090698, "learning_rate": 7.05679987712595e-05, "loss": 4.227, "step": 3000 }, { "epoch": 0.6947289460777243, "grad_norm": 1.19216787815094, "learning_rate": 6.96028719174428e-05, "loss": 4.2868, "step": 3010 }, { "epoch": 0.6970370156660224, "grad_norm": 1.2830464839935303, "learning_rate": 6.864239254398352e-05, "loss": 4.2326, "step": 3020 }, { "epoch": 0.6993450852543204, "grad_norm": 1.2899502515792847, "learning_rate": 6.76866161728778e-05, "loss": 4.2616, "step": 3030 }, { "epoch": 0.7011915409249588, "eval_loss": 4.435102462768555, "eval_runtime": 39.3746, "eval_samples_per_second": 634.927, "eval_steps_per_second": 79.366, "step": 3038 }, { "epoch": 0.7016531548426185, "grad_norm": 1.1735284328460693, "learning_rate": 6.67355980542571e-05, "loss": 4.2741, "step": 3040 }, { "epoch": 0.7039612244309166, "grad_norm": 1.2111891508102417, "learning_rate": 6.578939316319502e-05, "loss": 4.2271, "step": 3050 }, { "epoch": 0.7062692940192147, "grad_norm": 1.188081979751587, "learning_rate": 6.484805619652893e-05, "loss": 4.2188, "step": 3060 }, { "epoch": 0.7085773636075128, "grad_norm": 1.285130500793457, "learning_rate": 6.391164156969856e-05, "loss": 4.193, "step": 3070 }, { "epoch": 0.7108854331958109, "grad_norm": 1.1954678297042847, "learning_rate": 6.298020341359972e-05, "loss": 4.211, "step": 3080 }, { "epoch": 0.713193502784109, "grad_norm": 1.1639289855957031, "learning_rate": 6.205379557145607e-05, "loss": 4.2421, "step": 3090 }, { "epoch": 0.7155015723724071, "grad_norm": 1.1945711374282837, "learning_rate": 6.113247159570591e-05, "loss": 4.2843, "step": 3100 }, { "epoch": 0.7178096419607051, "grad_norm": 1.2462584972381592, "learning_rate": 6.0216284744907036e-05, "loss": 4.2239, "step": 3110 }, { "epoch": 0.7201177115490032, "grad_norm": 1.2085964679718018, "learning_rate": 5.930528798065741e-05, "loss": 4.2027, "step": 3120 }, { "epoch": 0.7224257811373013, "grad_norm": 1.1740282773971558, "learning_rate": 5.839953396453442e-05, "loss": 4.2056, "step": 3130 }, { "epoch": 0.7247338507255994, "grad_norm": 1.1846504211425781, "learning_rate": 5.749907505504999e-05, "loss": 4.2292, "step": 3140 }, { "epoch": 0.7270419203138975, "grad_norm": 1.3088330030441284, "learning_rate": 5.660396330462448e-05, "loss": 4.2503, "step": 3150 }, { "epoch": 0.7293499899021956, "grad_norm": 1.2662092447280884, "learning_rate": 5.571425045657711e-05, "loss": 4.2536, "step": 3160 }, { "epoch": 0.7316580594904937, "grad_norm": 1.1370505094528198, "learning_rate": 5.4829987942135495e-05, "loss": 4.1551, "step": 3170 }, { "epoch": 0.7339661290787918, "grad_norm": 1.244957447052002, "learning_rate": 5.395122687746217e-05, "loss": 4.2295, "step": 3180 }, { "epoch": 0.7362741986670898, "grad_norm": 1.2070027589797974, "learning_rate": 5.3078018060699836e-05, "loss": 4.2056, "step": 3190 }, { "epoch": 0.7385822682553879, "grad_norm": 1.2402708530426025, "learning_rate": 5.221041196903489e-05, "loss": 4.243, "step": 3200 }, { "epoch": 0.740890337843686, "grad_norm": 1.2046093940734863, "learning_rate": 5.1348458755779706e-05, "loss": 4.2083, "step": 3210 }, { "epoch": 0.7431984074319841, "grad_norm": 1.2073742151260376, "learning_rate": 5.049220824747306e-05, "loss": 4.2024, "step": 3220 }, { "epoch": 0.7455064770202822, "grad_norm": 1.1986374855041504, "learning_rate": 4.964170994100019e-05, "loss": 4.2975, "step": 3230 }, { "epoch": 0.7478145466085803, "grad_norm": 1.2024635076522827, "learning_rate": 4.879701300073134e-05, "loss": 4.2499, "step": 3240 }, { "epoch": 0.7501226161968784, "grad_norm": 1.20681893825531, "learning_rate": 4.7958166255679787e-05, "loss": 4.2109, "step": 3250 }, { "epoch": 0.7524306857851765, "grad_norm": 1.2092540264129639, "learning_rate": 4.712521819667936e-05, "loss": 4.2221, "step": 3260 }, { "epoch": 0.7547387553734745, "grad_norm": 1.1802047491073608, "learning_rate": 4.629821697358108e-05, "loss": 4.202, "step": 3270 }, { "epoch": 0.7570468249617726, "grad_norm": 1.237534999847412, "learning_rate": 4.5477210392469944e-05, "loss": 4.2039, "step": 3280 }, { "epoch": 0.7593548945500707, "grad_norm": 1.212430715560913, "learning_rate": 4.4662245912901364e-05, "loss": 4.2043, "step": 3290 }, { "epoch": 0.7616629641383688, "grad_norm": 1.2303744554519653, "learning_rate": 4.38533706451579e-05, "loss": 4.2249, "step": 3300 }, { "epoch": 0.7639710337266669, "grad_norm": 1.2151821851730347, "learning_rate": 4.305063134752559e-05, "loss": 4.2416, "step": 3310 }, { "epoch": 0.766279103314965, "grad_norm": 1.1624999046325684, "learning_rate": 4.225407442359134e-05, "loss": 4.248, "step": 3320 }, { "epoch": 0.7685871729032631, "grad_norm": 1.1886614561080933, "learning_rate": 4.1463745919560296e-05, "loss": 4.1549, "step": 3330 }, { "epoch": 0.7708952424915612, "grad_norm": 1.141176700592041, "learning_rate": 4.067969152159433e-05, "loss": 4.1967, "step": 3340 }, { "epoch": 0.7732033120798593, "grad_norm": 1.1527976989746094, "learning_rate": 3.9901956553170714e-05, "loss": 4.193, "step": 3350 }, { "epoch": 0.7755113816681573, "grad_norm": 1.2025054693222046, "learning_rate": 3.913058597246242e-05, "loss": 4.1946, "step": 3360 }, { "epoch": 0.7778194512564554, "grad_norm": 1.13848078250885, "learning_rate": 3.836562436973906e-05, "loss": 4.1719, "step": 3370 }, { "epoch": 0.7801275208447535, "grad_norm": 1.2138718366622925, "learning_rate": 3.7607115964789537e-05, "loss": 4.2069, "step": 3380 }, { "epoch": 0.7824355904330516, "grad_norm": 1.1834176778793335, "learning_rate": 3.6855104604365485e-05, "loss": 4.2246, "step": 3390 }, { "epoch": 0.7847436600213497, "grad_norm": 1.157386064529419, "learning_rate": 3.610963375964694e-05, "loss": 4.2147, "step": 3400 }, { "epoch": 0.7870517296096478, "grad_norm": 1.251615285873413, "learning_rate": 3.5370746523729215e-05, "loss": 4.2354, "step": 3410 }, { "epoch": 0.7893597991979459, "grad_norm": 1.279531478881836, "learning_rate": 3.463848560913199e-05, "loss": 4.2083, "step": 3420 }, { "epoch": 0.791667868786244, "grad_norm": 1.2485535144805908, "learning_rate": 3.391289334533026e-05, "loss": 4.1657, "step": 3430 }, { "epoch": 0.793975938374542, "grad_norm": 1.2322598695755005, "learning_rate": 3.3194011676307234e-05, "loss": 4.1474, "step": 3440 }, { "epoch": 0.79628400796284, "grad_norm": 1.2795343399047852, "learning_rate": 3.248188215812985e-05, "loss": 4.1557, "step": 3450 }, { "epoch": 0.7985920775511381, "grad_norm": 1.236671805381775, "learning_rate": 3.1776545956546473e-05, "loss": 4.1628, "step": 3460 }, { "epoch": 0.8009001471394362, "grad_norm": 1.163258671760559, "learning_rate": 3.107804384460745e-05, "loss": 4.2085, "step": 3470 }, { "epoch": 0.8013617610570959, "eval_loss": 4.39634370803833, "eval_runtime": 39.8062, "eval_samples_per_second": 628.043, "eval_steps_per_second": 78.505, "step": 3472 }, { "epoch": 0.8032082167277343, "grad_norm": 1.2136709690093994, "learning_rate": 3.0386416200307772e-05, "loss": 4.2476, "step": 3480 }, { "epoch": 0.8055162863160324, "grad_norm": 1.214560866355896, "learning_rate": 2.970170300425341e-05, "loss": 4.1994, "step": 3490 }, { "epoch": 0.8078243559043305, "grad_norm": 1.2116317749023438, "learning_rate": 2.9023943837349795e-05, "loss": 4.1864, "step": 3500 }, { "epoch": 0.8101324254926286, "grad_norm": 1.2190097570419312, "learning_rate": 2.835317787851411e-05, "loss": 4.2019, "step": 3510 }, { "epoch": 0.8124404950809266, "grad_norm": 1.2359529733657837, "learning_rate": 2.768944390241012e-05, "loss": 4.1716, "step": 3520 }, { "epoch": 0.8147485646692247, "grad_norm": 1.255321979522705, "learning_rate": 2.703278027720713e-05, "loss": 4.1866, "step": 3530 }, { "epoch": 0.8170566342575228, "grad_norm": 1.2041914463043213, "learning_rate": 2.6383224962361766e-05, "loss": 4.2161, "step": 3540 }, { "epoch": 0.8193647038458209, "grad_norm": 1.2864853143692017, "learning_rate": 2.5740815506423917e-05, "loss": 4.1654, "step": 3550 }, { "epoch": 0.821672773434119, "grad_norm": 1.3623309135437012, "learning_rate": 2.51055890448658e-05, "loss": 4.2003, "step": 3560 }, { "epoch": 0.8239808430224171, "grad_norm": 1.291591763496399, "learning_rate": 2.44775822979358e-05, "loss": 4.159, "step": 3570 }, { "epoch": 0.8262889126107152, "grad_norm": 1.2855194807052612, "learning_rate": 2.3856831568535307e-05, "loss": 4.1886, "step": 3580 }, { "epoch": 0.8285969821990133, "grad_norm": 1.2652373313903809, "learning_rate": 2.324337274012061e-05, "loss": 4.1722, "step": 3590 }, { "epoch": 0.8309050517873113, "grad_norm": 1.2906244993209839, "learning_rate": 2.2637241274628108e-05, "loss": 4.1888, "step": 3600 }, { "epoch": 0.8332131213756094, "grad_norm": 1.2488876581192017, "learning_rate": 2.2038472210424952e-05, "loss": 4.2159, "step": 3610 }, { "epoch": 0.8355211909639075, "grad_norm": 1.2642358541488647, "learning_rate": 2.1447100160283082e-05, "loss": 4.1982, "step": 3620 }, { "epoch": 0.8378292605522056, "grad_norm": 1.2974900007247925, "learning_rate": 2.0863159309378657e-05, "loss": 4.2046, "step": 3630 }, { "epoch": 0.8401373301405037, "grad_norm": 1.2382539510726929, "learning_rate": 2.0286683413315873e-05, "loss": 4.1495, "step": 3640 }, { "epoch": 0.8424453997288018, "grad_norm": 1.25460946559906, "learning_rate": 1.9717705796175727e-05, "loss": 4.2023, "step": 3650 }, { "epoch": 0.8447534693170999, "grad_norm": 1.22829270362854, "learning_rate": 1.9156259348589514e-05, "loss": 4.1346, "step": 3660 }, { "epoch": 0.847061538905398, "grad_norm": 1.2403064966201782, "learning_rate": 1.8602376525837655e-05, "loss": 4.1988, "step": 3670 }, { "epoch": 0.8493696084936961, "grad_norm": 1.2172107696533203, "learning_rate": 1.8056089345973536e-05, "loss": 4.2222, "step": 3680 }, { "epoch": 0.8516776780819941, "grad_norm": 1.2095916271209717, "learning_rate": 1.7517429387972608e-05, "loss": 4.1647, "step": 3690 }, { "epoch": 0.8539857476702922, "grad_norm": 1.2624644041061401, "learning_rate": 1.6986427789907115e-05, "loss": 4.2337, "step": 3700 }, { "epoch": 0.8562938172585903, "grad_norm": 1.2449021339416504, "learning_rate": 1.6463115247145782e-05, "loss": 4.1926, "step": 3710 }, { "epoch": 0.8586018868468884, "grad_norm": 1.2478352785110474, "learning_rate": 1.594752201057968e-05, "loss": 4.1702, "step": 3720 }, { "epoch": 0.8609099564351865, "grad_norm": 1.263993740081787, "learning_rate": 1.5439677884873424e-05, "loss": 4.1817, "step": 3730 }, { "epoch": 0.8632180260234846, "grad_norm": 1.1771023273468018, "learning_rate": 1.4939612226742347e-05, "loss": 4.1884, "step": 3740 }, { "epoch": 0.8655260956117827, "grad_norm": 1.2805135250091553, "learning_rate": 1.4447353943255341e-05, "loss": 4.1973, "step": 3750 }, { "epoch": 0.8678341652000808, "grad_norm": 1.2390002012252808, "learning_rate": 1.3962931490163992e-05, "loss": 4.163, "step": 3760 }, { "epoch": 0.8701422347883788, "grad_norm": 1.1928465366363525, "learning_rate": 1.3486372870257539e-05, "loss": 4.2661, "step": 3770 }, { "epoch": 0.8724503043766769, "grad_norm": 1.227087378501892, "learning_rate": 1.3017705631744263e-05, "loss": 4.1941, "step": 3780 }, { "epoch": 0.874758373964975, "grad_norm": 1.2422763109207153, "learning_rate": 1.255695686665883e-05, "loss": 4.1729, "step": 3790 }, { "epoch": 0.8770664435532731, "grad_norm": 1.2221300601959229, "learning_rate": 1.2104153209296374e-05, "loss": 4.1766, "step": 3800 }, { "epoch": 0.8793745131415712, "grad_norm": 1.2882457971572876, "learning_rate": 1.1659320834672753e-05, "loss": 4.2104, "step": 3810 }, { "epoch": 0.8816825827298693, "grad_norm": 1.3336913585662842, "learning_rate": 1.1222485457011516e-05, "loss": 4.2294, "step": 3820 }, { "epoch": 0.8839906523181674, "grad_norm": 1.1365299224853516, "learning_rate": 1.079367232825743e-05, "loss": 4.1763, "step": 3830 }, { "epoch": 0.8862987219064655, "grad_norm": 1.3608059883117676, "learning_rate": 1.0372906236616734e-05, "loss": 4.2236, "step": 3840 }, { "epoch": 0.8886067914947635, "grad_norm": 1.3224635124206543, "learning_rate": 9.960211505124215e-06, "loss": 4.1455, "step": 3850 }, { "epoch": 0.8909148610830616, "grad_norm": 1.2558730840682983, "learning_rate": 9.555611990237317e-06, "loss": 4.1669, "step": 3860 }, { "epoch": 0.8932229306713597, "grad_norm": 1.2785941362380981, "learning_rate": 9.159131080456839e-06, "loss": 4.1952, "step": 3870 }, { "epoch": 0.8955310002596578, "grad_norm": 1.2829550504684448, "learning_rate": 8.770791694975093e-06, "loss": 4.1654, "step": 3880 }, { "epoch": 0.8978390698479559, "grad_norm": 1.3000894784927368, "learning_rate": 8.390616282350992e-06, "loss": 4.223, "step": 3890 }, { "epoch": 0.900147139436254, "grad_norm": 1.4616061449050903, "learning_rate": 8.018626819212354e-06, "loss": 4.1981, "step": 3900 }, { "epoch": 0.9015319811892328, "eval_loss": 4.381021022796631, "eval_runtime": 39.549, "eval_samples_per_second": 632.127, "eval_steps_per_second": 79.016, "step": 3906 }, { "epoch": 0.9024552090245521, "grad_norm": 1.295082449913025, "learning_rate": 7.654844808985617e-06, "loss": 4.1607, "step": 3910 }, { "epoch": 0.9047632786128502, "grad_norm": 1.3329360485076904, "learning_rate": 7.299291280652503e-06, "loss": 4.1855, "step": 3920 }, { "epoch": 0.9070713482011483, "grad_norm": 1.2815297842025757, "learning_rate": 6.951986787534824e-06, "loss": 4.2036, "step": 3930 }, { "epoch": 0.9093794177894463, "grad_norm": 1.2049307823181152, "learning_rate": 6.612951406106015e-06, "loss": 4.1678, "step": 3940 }, { "epoch": 0.9116874873777444, "grad_norm": 1.3223621845245361, "learning_rate": 6.282204734830826e-06, "loss": 4.1758, "step": 3950 }, { "epoch": 0.9139955569660425, "grad_norm": 1.3526756763458252, "learning_rate": 5.959765893032131e-06, "loss": 4.1829, "step": 3960 }, { "epoch": 0.9163036265543406, "grad_norm": 1.3002768754959106, "learning_rate": 5.645653519786064e-06, "loss": 4.1908, "step": 3970 }, { "epoch": 0.9186116961426387, "grad_norm": 1.294180989265442, "learning_rate": 5.339885772844227e-06, "loss": 4.1767, "step": 3980 }, { "epoch": 0.9209197657309368, "grad_norm": 1.2366904020309448, "learning_rate": 5.042480327584231e-06, "loss": 4.1572, "step": 3990 }, { "epoch": 0.9232278353192349, "grad_norm": 1.2950676679611206, "learning_rate": 4.753454375987898e-06, "loss": 4.1748, "step": 4000 }, { "epoch": 0.925535904907533, "grad_norm": 1.3021730184555054, "learning_rate": 4.472824625647503e-06, "loss": 4.1417, "step": 4010 }, { "epoch": 0.927843974495831, "grad_norm": 1.3200892210006714, "learning_rate": 4.2006072987998355e-06, "loss": 4.1742, "step": 4020 }, { "epoch": 0.9301520440841291, "grad_norm": 1.2840920686721802, "learning_rate": 3.9368181313886085e-06, "loss": 4.1808, "step": 4030 }, { "epoch": 0.9324601136724272, "grad_norm": 1.2291038036346436, "learning_rate": 3.681472372154659e-06, "loss": 4.2205, "step": 4040 }, { "epoch": 0.9347681832607253, "grad_norm": 1.2467334270477295, "learning_rate": 3.434584781754668e-06, "loss": 4.2397, "step": 4050 }, { "epoch": 0.9370762528490234, "grad_norm": 1.292160153388977, "learning_rate": 3.196169631907658e-06, "loss": 4.1912, "step": 4060 }, { "epoch": 0.9393843224373215, "grad_norm": 1.3593335151672363, "learning_rate": 2.966240704570205e-06, "loss": 4.1743, "step": 4070 }, { "epoch": 0.9416923920256196, "grad_norm": 1.2135837078094482, "learning_rate": 2.7448112911396257e-06, "loss": 4.1725, "step": 4080 }, { "epoch": 0.9440004616139177, "grad_norm": 1.2826229333877563, "learning_rate": 2.5318941916857236e-06, "loss": 4.1779, "step": 4090 }, { "epoch": 0.9463085312022157, "grad_norm": 1.3116379976272583, "learning_rate": 2.327501714210783e-06, "loss": 4.1753, "step": 4100 }, { "epoch": 0.9486166007905138, "grad_norm": 1.2221295833587646, "learning_rate": 2.1316456739381373e-06, "loss": 4.1947, "step": 4110 }, { "epoch": 0.9509246703788119, "grad_norm": 1.2066534757614136, "learning_rate": 1.9443373926291806e-06, "loss": 4.1914, "step": 4120 }, { "epoch": 0.95323273996711, "grad_norm": 1.2396924495697021, "learning_rate": 1.765587697928844e-06, "loss": 4.1717, "step": 4130 }, { "epoch": 0.9555408095554081, "grad_norm": 1.2560229301452637, "learning_rate": 1.5954069227397782e-06, "loss": 4.1566, "step": 4140 }, { "epoch": 0.9578488791437062, "grad_norm": 1.4303022623062134, "learning_rate": 1.43380490462493e-06, "loss": 4.1762, "step": 4150 }, { "epoch": 0.9601569487320043, "grad_norm": 1.2595306634902954, "learning_rate": 1.2807909852389498e-06, "loss": 4.1753, "step": 4160 }, { "epoch": 0.9624650183203024, "grad_norm": 1.1923511028289795, "learning_rate": 1.1363740097881547e-06, "loss": 4.1417, "step": 4170 }, { "epoch": 0.9647730879086004, "grad_norm": 1.1929636001586914, "learning_rate": 1.0005623265192053e-06, "loss": 4.1698, "step": 4180 }, { "epoch": 0.9670811574968985, "grad_norm": 1.2409882545471191, "learning_rate": 8.733637862365251e-07, "loss": 4.1773, "step": 4190 }, { "epoch": 0.9693892270851966, "grad_norm": 1.2641384601593018, "learning_rate": 7.547857418485459e-07, "loss": 4.1271, "step": 4200 }, { "epoch": 0.9716972966734947, "grad_norm": 1.1365667581558228, "learning_rate": 6.448350479425157e-07, "loss": 4.14, "step": 4210 }, { "epoch": 0.9740053662617928, "grad_norm": 1.2669204473495483, "learning_rate": 5.435180603884148e-07, "loss": 4.1989, "step": 4220 }, { "epoch": 0.9763134358500909, "grad_norm": 1.2706636190414429, "learning_rate": 4.508406359714001e-07, "loss": 4.1727, "step": 4230 }, { "epoch": 0.978621505438389, "grad_norm": 1.293450951576233, "learning_rate": 3.6680813205339223e-07, "loss": 4.1872, "step": 4240 }, { "epoch": 0.9809295750266871, "grad_norm": 1.2493462562561035, "learning_rate": 2.9142540626325707e-07, "loss": 4.1937, "step": 4250 }, { "epoch": 0.9832376446149852, "grad_norm": 1.265554428100586, "learning_rate": 2.246968162160634e-07, "loss": 4.1332, "step": 4260 }, { "epoch": 0.9855457142032832, "grad_norm": 1.2552322149276733, "learning_rate": 1.6662621926118446e-07, "loss": 4.221, "step": 4270 }, { "epoch": 0.9878537837915813, "grad_norm": 1.2385802268981934, "learning_rate": 1.1721697225932636e-07, "loss": 4.1898, "step": 4280 }, { "epoch": 0.9901618533798794, "grad_norm": 1.358494520187378, "learning_rate": 7.647193138843322e-08, "loss": 4.1357, "step": 4290 }, { "epoch": 0.9924699229681775, "grad_norm": 1.174631953239441, "learning_rate": 4.439345197861932e-08, "loss": 4.1818, "step": 4300 }, { "epoch": 0.9947779925564756, "grad_norm": 1.249579668045044, "learning_rate": 2.0983388376011324e-08, "loss": 4.1376, "step": 4310 }, { "epoch": 0.9970860621447737, "grad_norm": 1.2420294284820557, "learning_rate": 6.243093835567314e-09, "loss": 4.1601, "step": 4320 }, { "epoch": 0.9993941317330718, "grad_norm": 1.2425047159194946, "learning_rate": 1.734204427727981e-10, "loss": 4.1714, "step": 4330 } ], "logging_steps": 10, "max_steps": 4332, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 434, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2436421205975040.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }