{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 18564, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00808015513897867, "grad_norm": 40.89574432373047, "learning_rate": 1.995151906916613e-05, "loss": 25.8011, "step": 50 }, { "epoch": 0.01616031027795734, "grad_norm": 1.4409021139144897, "learning_rate": 1.9897651368239605e-05, "loss": 2.011, "step": 100 }, { "epoch": 0.024240465416936006, "grad_norm": 0.27292221784591675, "learning_rate": 1.9843783667313083e-05, "loss": 0.1682, "step": 150 }, { "epoch": 0.03232062055591468, "grad_norm": 0.21620342135429382, "learning_rate": 1.9789915966386557e-05, "loss": 0.1127, "step": 200 }, { "epoch": 0.040400775694893344, "grad_norm": 0.19684338569641113, "learning_rate": 1.9736048265460034e-05, "loss": 0.0924, "step": 250 }, { "epoch": 0.04848093083387201, "grad_norm": 0.21188506484031677, "learning_rate": 1.9682180564533505e-05, "loss": 0.0768, "step": 300 }, { "epoch": 0.05656108597285068, "grad_norm": 0.16330359876155853, "learning_rate": 1.9628312863606982e-05, "loss": 0.0596, "step": 350 }, { "epoch": 0.06464124111182935, "grad_norm": 0.12760096788406372, "learning_rate": 1.9574445162680457e-05, "loss": 0.0491, "step": 400 }, { "epoch": 0.07272139625080802, "grad_norm": 0.16558507084846497, "learning_rate": 1.9520577461753934e-05, "loss": 0.0424, "step": 450 }, { "epoch": 0.08080155138978669, "grad_norm": 0.1511087417602539, "learning_rate": 1.9466709760827408e-05, "loss": 0.0376, "step": 500 }, { "epoch": 0.08888170652876536, "grad_norm": 0.09860303997993469, "learning_rate": 1.9412842059900886e-05, "loss": 0.0333, "step": 550 }, { "epoch": 0.09696186166774402, "grad_norm": 0.1415504515171051, "learning_rate": 1.935897435897436e-05, "loss": 0.0299, "step": 600 }, { "epoch": 0.10504201680672269, "grad_norm": 0.12042852491140366, "learning_rate": 1.9305106658047837e-05, "loss": 0.0298, "step": 650 }, { "epoch": 0.11312217194570136, "grad_norm": 0.14820343255996704, "learning_rate": 1.925123895712131e-05, "loss": 0.0297, "step": 700 }, { "epoch": 0.12120232708468003, "grad_norm": 0.11845523118972778, "learning_rate": 1.919737125619479e-05, "loss": 0.024, "step": 750 }, { "epoch": 0.1292824822236587, "grad_norm": 0.13682147860527039, "learning_rate": 1.9143503555268263e-05, "loss": 0.024, "step": 800 }, { "epoch": 0.13736263736263737, "grad_norm": 0.07670263946056366, "learning_rate": 1.908963585434174e-05, "loss": 0.0216, "step": 850 }, { "epoch": 0.14544279250161604, "grad_norm": 0.10427036881446838, "learning_rate": 1.903576815341521e-05, "loss": 0.0229, "step": 900 }, { "epoch": 0.1535229476405947, "grad_norm": 0.14853981137275696, "learning_rate": 1.898190045248869e-05, "loss": 0.0204, "step": 950 }, { "epoch": 0.16160310277957338, "grad_norm": 0.08788396418094635, "learning_rate": 1.8928032751562163e-05, "loss": 0.0201, "step": 1000 }, { "epoch": 0.16968325791855204, "grad_norm": 0.1365722268819809, "learning_rate": 1.887416505063564e-05, "loss": 0.019, "step": 1050 }, { "epoch": 0.1777634130575307, "grad_norm": 0.133611798286438, "learning_rate": 1.8820297349709114e-05, "loss": 0.0212, "step": 1100 }, { "epoch": 0.18584356819650938, "grad_norm": 0.09723819047212601, "learning_rate": 1.8766429648782592e-05, "loss": 0.0169, "step": 1150 }, { "epoch": 0.19392372333548805, "grad_norm": 0.11186394095420837, "learning_rate": 1.8712561947856066e-05, "loss": 0.0178, "step": 1200 }, { "epoch": 0.2020038784744667, "grad_norm": 0.10648094117641449, "learning_rate": 1.8658694246929544e-05, "loss": 0.0172, "step": 1250 }, { "epoch": 0.21008403361344538, "grad_norm": 0.10959897935390472, "learning_rate": 1.8604826546003018e-05, "loss": 0.0178, "step": 1300 }, { "epoch": 0.21816418875242405, "grad_norm": 0.09298472851514816, "learning_rate": 1.8550958845076495e-05, "loss": 0.0154, "step": 1350 }, { "epoch": 0.22624434389140272, "grad_norm": 0.08410840481519699, "learning_rate": 1.849709114414997e-05, "loss": 0.0162, "step": 1400 }, { "epoch": 0.23432449903038138, "grad_norm": 0.15365834534168243, "learning_rate": 1.8443223443223447e-05, "loss": 0.0148, "step": 1450 }, { "epoch": 0.24240465416936005, "grad_norm": 0.09604126960039139, "learning_rate": 1.838935574229692e-05, "loss": 0.0146, "step": 1500 }, { "epoch": 0.2504848093083387, "grad_norm": 0.06805470585823059, "learning_rate": 1.8335488041370395e-05, "loss": 0.0162, "step": 1550 }, { "epoch": 0.2585649644473174, "grad_norm": 0.11596197634935379, "learning_rate": 1.8281620340443872e-05, "loss": 0.0112, "step": 1600 }, { "epoch": 0.26664511958629605, "grad_norm": 0.08166597783565521, "learning_rate": 1.8227752639517347e-05, "loss": 0.0136, "step": 1650 }, { "epoch": 0.27472527472527475, "grad_norm": 0.08969355374574661, "learning_rate": 1.817388493859082e-05, "loss": 0.0132, "step": 1700 }, { "epoch": 0.2828054298642534, "grad_norm": 0.0873119905591011, "learning_rate": 1.8120017237664298e-05, "loss": 0.0129, "step": 1750 }, { "epoch": 0.2908855850032321, "grad_norm": 0.10579364001750946, "learning_rate": 1.8066149536737772e-05, "loss": 0.0123, "step": 1800 }, { "epoch": 0.2989657401422107, "grad_norm": 0.12571881711483002, "learning_rate": 1.801228183581125e-05, "loss": 0.0128, "step": 1850 }, { "epoch": 0.3070458952811894, "grad_norm": 0.08085139840841293, "learning_rate": 1.7958414134884724e-05, "loss": 0.0121, "step": 1900 }, { "epoch": 0.31512605042016806, "grad_norm": 0.06765197962522507, "learning_rate": 1.79045464339582e-05, "loss": 0.0103, "step": 1950 }, { "epoch": 0.32320620555914675, "grad_norm": 0.07517407834529877, "learning_rate": 1.7850678733031676e-05, "loss": 0.0107, "step": 2000 }, { "epoch": 0.3312863606981254, "grad_norm": 0.060215964913368225, "learning_rate": 1.779681103210515e-05, "loss": 0.0123, "step": 2050 }, { "epoch": 0.3393665158371041, "grad_norm": 0.08518676459789276, "learning_rate": 1.7742943331178627e-05, "loss": 0.0107, "step": 2100 }, { "epoch": 0.3474466709760827, "grad_norm": 0.05002804845571518, "learning_rate": 1.76890756302521e-05, "loss": 0.0108, "step": 2150 }, { "epoch": 0.3555268261150614, "grad_norm": 0.042444173246622086, "learning_rate": 1.763520792932558e-05, "loss": 0.01, "step": 2200 }, { "epoch": 0.36360698125404006, "grad_norm": 0.07093804329633713, "learning_rate": 1.7581340228399053e-05, "loss": 0.0113, "step": 2250 }, { "epoch": 0.37168713639301876, "grad_norm": 0.059032320976257324, "learning_rate": 1.752747252747253e-05, "loss": 0.011, "step": 2300 }, { "epoch": 0.3797672915319974, "grad_norm": 0.05217365920543671, "learning_rate": 1.7473604826546004e-05, "loss": 0.0097, "step": 2350 }, { "epoch": 0.3878474466709761, "grad_norm": 0.12220600247383118, "learning_rate": 1.7419737125619482e-05, "loss": 0.01, "step": 2400 }, { "epoch": 0.39592760180995473, "grad_norm": 0.09106040000915527, "learning_rate": 1.7365869424692956e-05, "loss": 0.0117, "step": 2450 }, { "epoch": 0.4040077569489334, "grad_norm": 0.09308184683322906, "learning_rate": 1.731200172376643e-05, "loss": 0.01, "step": 2500 }, { "epoch": 0.41208791208791207, "grad_norm": 0.08041771501302719, "learning_rate": 1.7258134022839908e-05, "loss": 0.0112, "step": 2550 }, { "epoch": 0.42016806722689076, "grad_norm": 0.05206672102212906, "learning_rate": 1.7204266321913382e-05, "loss": 0.0103, "step": 2600 }, { "epoch": 0.4282482223658694, "grad_norm": 0.0851343497633934, "learning_rate": 1.7150398620986856e-05, "loss": 0.0093, "step": 2650 }, { "epoch": 0.4363283775048481, "grad_norm": 0.08132291585206985, "learning_rate": 1.7096530920060333e-05, "loss": 0.0118, "step": 2700 }, { "epoch": 0.44440853264382674, "grad_norm": 0.06463415920734406, "learning_rate": 1.7042663219133807e-05, "loss": 0.0106, "step": 2750 }, { "epoch": 0.45248868778280543, "grad_norm": 0.08292311429977417, "learning_rate": 1.6988795518207285e-05, "loss": 0.0097, "step": 2800 }, { "epoch": 0.46056884292178407, "grad_norm": 0.10132889449596405, "learning_rate": 1.693492781728076e-05, "loss": 0.009, "step": 2850 }, { "epoch": 0.46864899806076277, "grad_norm": 0.06622787564992905, "learning_rate": 1.6881060116354237e-05, "loss": 0.0092, "step": 2900 }, { "epoch": 0.47672915319974146, "grad_norm": 0.055822569876909256, "learning_rate": 1.682719241542771e-05, "loss": 0.0088, "step": 2950 }, { "epoch": 0.4848093083387201, "grad_norm": 0.04172702878713608, "learning_rate": 1.6773324714501188e-05, "loss": 0.0081, "step": 3000 }, { "epoch": 0.4928894634776988, "grad_norm": 0.08130078762769699, "learning_rate": 1.6719457013574662e-05, "loss": 0.0085, "step": 3050 }, { "epoch": 0.5009696186166774, "grad_norm": 0.048671796917915344, "learning_rate": 1.666558931264814e-05, "loss": 0.0079, "step": 3100 }, { "epoch": 0.5090497737556561, "grad_norm": 0.12270870059728622, "learning_rate": 1.6611721611721614e-05, "loss": 0.0084, "step": 3150 }, { "epoch": 0.5171299288946348, "grad_norm": 0.053976595401763916, "learning_rate": 1.6557853910795088e-05, "loss": 0.0095, "step": 3200 }, { "epoch": 0.5252100840336135, "grad_norm": 0.09643538296222687, "learning_rate": 1.6503986209868562e-05, "loss": 0.0086, "step": 3250 }, { "epoch": 0.5332902391725921, "grad_norm": 0.04082519933581352, "learning_rate": 1.645011850894204e-05, "loss": 0.0085, "step": 3300 }, { "epoch": 0.5413703943115707, "grad_norm": 0.07320540398359299, "learning_rate": 1.6396250808015514e-05, "loss": 0.0095, "step": 3350 }, { "epoch": 0.5494505494505495, "grad_norm": 0.08433382958173752, "learning_rate": 1.634238310708899e-05, "loss": 0.0086, "step": 3400 }, { "epoch": 0.5575307045895281, "grad_norm": 0.08490192890167236, "learning_rate": 1.6288515406162465e-05, "loss": 0.0087, "step": 3450 }, { "epoch": 0.5656108597285068, "grad_norm": 0.07407897710800171, "learning_rate": 1.6234647705235943e-05, "loss": 0.0095, "step": 3500 }, { "epoch": 0.5736910148674854, "grad_norm": 0.04006027430295944, "learning_rate": 1.6180780004309417e-05, "loss": 0.0085, "step": 3550 }, { "epoch": 0.5817711700064642, "grad_norm": 0.10100733488798141, "learning_rate": 1.6126912303382894e-05, "loss": 0.0082, "step": 3600 }, { "epoch": 0.5898513251454428, "grad_norm": 0.046667344868183136, "learning_rate": 1.607304460245637e-05, "loss": 0.0088, "step": 3650 }, { "epoch": 0.5979314802844214, "grad_norm": 0.07386622577905655, "learning_rate": 1.6019176901529846e-05, "loss": 0.0097, "step": 3700 }, { "epoch": 0.6060116354234001, "grad_norm": 0.07727360725402832, "learning_rate": 1.596530920060332e-05, "loss": 0.0078, "step": 3750 }, { "epoch": 0.6140917905623788, "grad_norm": 0.01824168674647808, "learning_rate": 1.5911441499676794e-05, "loss": 0.0082, "step": 3800 }, { "epoch": 0.6221719457013575, "grad_norm": 0.02981133572757244, "learning_rate": 1.585757379875027e-05, "loss": 0.0077, "step": 3850 }, { "epoch": 0.6302521008403361, "grad_norm": 0.07947336137294769, "learning_rate": 1.5803706097823746e-05, "loss": 0.0073, "step": 3900 }, { "epoch": 0.6383322559793148, "grad_norm": 0.04702315106987953, "learning_rate": 1.574983839689722e-05, "loss": 0.0074, "step": 3950 }, { "epoch": 0.6464124111182935, "grad_norm": 0.0397706963121891, "learning_rate": 1.5695970695970697e-05, "loss": 0.0085, "step": 4000 }, { "epoch": 0.6544925662572721, "grad_norm": 0.0445311963558197, "learning_rate": 1.564210299504417e-05, "loss": 0.0071, "step": 4050 }, { "epoch": 0.6625727213962508, "grad_norm": 0.03868868947029114, "learning_rate": 1.558823529411765e-05, "loss": 0.0071, "step": 4100 }, { "epoch": 0.6706528765352294, "grad_norm": 0.09177325665950775, "learning_rate": 1.5534367593191123e-05, "loss": 0.0078, "step": 4150 }, { "epoch": 0.6787330316742082, "grad_norm": 0.05718887969851494, "learning_rate": 1.54804998922646e-05, "loss": 0.008, "step": 4200 }, { "epoch": 0.6868131868131868, "grad_norm": 0.04949459806084633, "learning_rate": 1.5426632191338075e-05, "loss": 0.0073, "step": 4250 }, { "epoch": 0.6948933419521655, "grad_norm": 0.04109389707446098, "learning_rate": 1.5372764490411552e-05, "loss": 0.0073, "step": 4300 }, { "epoch": 0.7029734970911441, "grad_norm": 0.07063832134008408, "learning_rate": 1.5318896789485026e-05, "loss": 0.0083, "step": 4350 }, { "epoch": 0.7110536522301228, "grad_norm": 0.07452435046434402, "learning_rate": 1.52650290885585e-05, "loss": 0.0073, "step": 4400 }, { "epoch": 0.7191338073691015, "grad_norm": 0.04412061721086502, "learning_rate": 1.5211161387631976e-05, "loss": 0.0067, "step": 4450 }, { "epoch": 0.7272139625080801, "grad_norm": 0.07454030215740204, "learning_rate": 1.5157293686705452e-05, "loss": 0.0075, "step": 4500 }, { "epoch": 0.7352941176470589, "grad_norm": 0.04662289470434189, "learning_rate": 1.5103425985778928e-05, "loss": 0.0074, "step": 4550 }, { "epoch": 0.7433742727860375, "grad_norm": 0.05991057679057121, "learning_rate": 1.5049558284852404e-05, "loss": 0.0071, "step": 4600 }, { "epoch": 0.7514544279250162, "grad_norm": 0.04592648521065712, "learning_rate": 1.499569058392588e-05, "loss": 0.0076, "step": 4650 }, { "epoch": 0.7595345830639948, "grad_norm": 0.06309536099433899, "learning_rate": 1.4941822882999355e-05, "loss": 0.0076, "step": 4700 }, { "epoch": 0.7676147382029735, "grad_norm": 0.05248362198472023, "learning_rate": 1.4887955182072831e-05, "loss": 0.0075, "step": 4750 }, { "epoch": 0.7756948933419522, "grad_norm": 0.04493599757552147, "learning_rate": 1.4834087481146307e-05, "loss": 0.007, "step": 4800 }, { "epoch": 0.7837750484809308, "grad_norm": 0.020138224586844444, "learning_rate": 1.4780219780219783e-05, "loss": 0.0063, "step": 4850 }, { "epoch": 0.7918552036199095, "grad_norm": 0.073582723736763, "learning_rate": 1.4726352079293259e-05, "loss": 0.0063, "step": 4900 }, { "epoch": 0.7999353587588882, "grad_norm": 0.050810813903808594, "learning_rate": 1.4672484378366731e-05, "loss": 0.0079, "step": 4950 }, { "epoch": 0.8080155138978669, "grad_norm": 0.09147041290998459, "learning_rate": 1.4618616677440207e-05, "loss": 0.007, "step": 5000 }, { "epoch": 0.8160956690368455, "grad_norm": 0.02943544089794159, "learning_rate": 1.4564748976513683e-05, "loss": 0.0064, "step": 5050 }, { "epoch": 0.8241758241758241, "grad_norm": 0.058280181139707565, "learning_rate": 1.4510881275587158e-05, "loss": 0.0063, "step": 5100 }, { "epoch": 0.8322559793148029, "grad_norm": 0.05846063792705536, "learning_rate": 1.4457013574660634e-05, "loss": 0.0071, "step": 5150 }, { "epoch": 0.8403361344537815, "grad_norm": 0.08794039487838745, "learning_rate": 1.440314587373411e-05, "loss": 0.0066, "step": 5200 }, { "epoch": 0.8484162895927602, "grad_norm": 0.05881703272461891, "learning_rate": 1.4349278172807586e-05, "loss": 0.0075, "step": 5250 }, { "epoch": 0.8564964447317388, "grad_norm": 0.060504719614982605, "learning_rate": 1.4295410471881062e-05, "loss": 0.0061, "step": 5300 }, { "epoch": 0.8645765998707176, "grad_norm": 0.04584150016307831, "learning_rate": 1.4241542770954537e-05, "loss": 0.0059, "step": 5350 }, { "epoch": 0.8726567550096962, "grad_norm": 0.0818849429488182, "learning_rate": 1.4187675070028013e-05, "loss": 0.0067, "step": 5400 }, { "epoch": 0.8807369101486748, "grad_norm": 0.04742725193500519, "learning_rate": 1.4133807369101489e-05, "loss": 0.0066, "step": 5450 }, { "epoch": 0.8888170652876535, "grad_norm": 0.06628070026636124, "learning_rate": 1.4079939668174965e-05, "loss": 0.0069, "step": 5500 }, { "epoch": 0.8968972204266322, "grad_norm": 0.0498398132622242, "learning_rate": 1.4026071967248437e-05, "loss": 0.006, "step": 5550 }, { "epoch": 0.9049773755656109, "grad_norm": 0.07333686202764511, "learning_rate": 1.3972204266321913e-05, "loss": 0.0073, "step": 5600 }, { "epoch": 0.9130575307045895, "grad_norm": 0.040425509214401245, "learning_rate": 1.3918336565395389e-05, "loss": 0.0061, "step": 5650 }, { "epoch": 0.9211376858435681, "grad_norm": 0.04927436634898186, "learning_rate": 1.3864468864468865e-05, "loss": 0.0064, "step": 5700 }, { "epoch": 0.9292178409825469, "grad_norm": 0.07403150945901871, "learning_rate": 1.381060116354234e-05, "loss": 0.0057, "step": 5750 }, { "epoch": 0.9372979961215255, "grad_norm": 0.0294960867613554, "learning_rate": 1.3756733462615816e-05, "loss": 0.0057, "step": 5800 }, { "epoch": 0.9453781512605042, "grad_norm": 0.06763947010040283, "learning_rate": 1.3702865761689292e-05, "loss": 0.0061, "step": 5850 }, { "epoch": 0.9534583063994829, "grad_norm": 0.034876059740781784, "learning_rate": 1.3648998060762768e-05, "loss": 0.0073, "step": 5900 }, { "epoch": 0.9615384615384616, "grad_norm": 0.0363958366215229, "learning_rate": 1.3595130359836244e-05, "loss": 0.0065, "step": 5950 }, { "epoch": 0.9696186166774402, "grad_norm": 0.03505223989486694, "learning_rate": 1.354126265890972e-05, "loss": 0.0065, "step": 6000 }, { "epoch": 0.9776987718164188, "grad_norm": 0.010415361262857914, "learning_rate": 1.3487394957983195e-05, "loss": 0.0063, "step": 6050 }, { "epoch": 0.9857789269553976, "grad_norm": 0.05905304104089737, "learning_rate": 1.343352725705667e-05, "loss": 0.0055, "step": 6100 }, { "epoch": 0.9938590820943762, "grad_norm": 0.03133542463183403, "learning_rate": 1.3379659556130145e-05, "loss": 0.0062, "step": 6150 }, { "epoch": 1.0, "eval_loss": 0.00468644080683589, "eval_rouge1": 86.16729085635436, "eval_rouge2": 82.00613853575325, "eval_rougeL": 85.17049406119979, "eval_rougeLsum": 85.1321983661705, "eval_runtime": 50.3158, "eval_samples_per_second": 9.937, "eval_steps_per_second": 1.252, "step": 6188 }, { "epoch": 1.0019392372333549, "grad_norm": 0.0473632887005806, "learning_rate": 1.3325791855203621e-05, "loss": 0.006, "step": 6200 }, { "epoch": 1.0100193923723335, "grad_norm": 0.009613406844437122, "learning_rate": 1.3271924154277097e-05, "loss": 0.0058, "step": 6250 }, { "epoch": 1.0180995475113122, "grad_norm": 0.07586677372455597, "learning_rate": 1.3218056453350573e-05, "loss": 0.0053, "step": 6300 }, { "epoch": 1.0261797026502908, "grad_norm": 0.03996267169713974, "learning_rate": 1.3165266106442578e-05, "loss": 0.0053, "step": 6350 }, { "epoch": 1.0342598577892697, "grad_norm": 0.04150750860571861, "learning_rate": 1.3111398405516054e-05, "loss": 0.006, "step": 6400 }, { "epoch": 1.0423400129282483, "grad_norm": 0.047103989869356155, "learning_rate": 1.305753070458953e-05, "loss": 0.0063, "step": 6450 }, { "epoch": 1.050420168067227, "grad_norm": 0.04707568511366844, "learning_rate": 1.3003663003663005e-05, "loss": 0.0054, "step": 6500 }, { "epoch": 1.0585003232062056, "grad_norm": 0.04693872109055519, "learning_rate": 1.2949795302736481e-05, "loss": 0.0058, "step": 6550 }, { "epoch": 1.0665804783451842, "grad_norm": 0.03767074644565582, "learning_rate": 1.2895927601809957e-05, "loss": 0.0049, "step": 6600 }, { "epoch": 1.0746606334841629, "grad_norm": 0.049506109207868576, "learning_rate": 1.2842059900883433e-05, "loss": 0.0061, "step": 6650 }, { "epoch": 1.0827407886231415, "grad_norm": 0.040579311549663544, "learning_rate": 1.2788192199956907e-05, "loss": 0.0053, "step": 6700 }, { "epoch": 1.0908209437621201, "grad_norm": 0.06605598330497742, "learning_rate": 1.273432449903038e-05, "loss": 0.0049, "step": 6750 }, { "epoch": 1.098901098901099, "grad_norm": 0.0573626309633255, "learning_rate": 1.2680456798103857e-05, "loss": 0.0058, "step": 6800 }, { "epoch": 1.1069812540400776, "grad_norm": 0.04869833588600159, "learning_rate": 1.2626589097177332e-05, "loss": 0.0054, "step": 6850 }, { "epoch": 1.1150614091790563, "grad_norm": 0.030207501724362373, "learning_rate": 1.2572721396250808e-05, "loss": 0.0049, "step": 6900 }, { "epoch": 1.123141564318035, "grad_norm": 0.04019474983215332, "learning_rate": 1.2518853695324284e-05, "loss": 0.0054, "step": 6950 }, { "epoch": 1.1312217194570136, "grad_norm": 0.03316131606698036, "learning_rate": 1.246498599439776e-05, "loss": 0.0063, "step": 7000 }, { "epoch": 1.1393018745959922, "grad_norm": 0.04495147988200188, "learning_rate": 1.2411118293471236e-05, "loss": 0.006, "step": 7050 }, { "epoch": 1.1473820297349708, "grad_norm": 0.029968004673719406, "learning_rate": 1.2357250592544711e-05, "loss": 0.0055, "step": 7100 }, { "epoch": 1.1554621848739495, "grad_norm": 0.05552554875612259, "learning_rate": 1.2303382891618187e-05, "loss": 0.0056, "step": 7150 }, { "epoch": 1.1635423400129283, "grad_norm": 0.040211454033851624, "learning_rate": 1.2249515190691663e-05, "loss": 0.0059, "step": 7200 }, { "epoch": 1.171622495151907, "grad_norm": 0.030323076993227005, "learning_rate": 1.2195647489765139e-05, "loss": 0.0057, "step": 7250 }, { "epoch": 1.1797026502908856, "grad_norm": 0.040990181267261505, "learning_rate": 1.2141779788838613e-05, "loss": 0.0053, "step": 7300 }, { "epoch": 1.1877828054298643, "grad_norm": 0.03895975649356842, "learning_rate": 1.2087912087912089e-05, "loss": 0.0054, "step": 7350 }, { "epoch": 1.195862960568843, "grad_norm": 0.04454527050256729, "learning_rate": 1.2034044386985565e-05, "loss": 0.0053, "step": 7400 }, { "epoch": 1.2039431157078215, "grad_norm": 0.04071690887212753, "learning_rate": 1.198017668605904e-05, "loss": 0.0053, "step": 7450 }, { "epoch": 1.2120232708468002, "grad_norm": 0.03768669813871384, "learning_rate": 1.1926308985132516e-05, "loss": 0.0059, "step": 7500 }, { "epoch": 1.220103425985779, "grad_norm": 0.048281479626894, "learning_rate": 1.187244128420599e-05, "loss": 0.0057, "step": 7550 }, { "epoch": 1.2281835811247577, "grad_norm": 0.037021659314632416, "learning_rate": 1.1818573583279466e-05, "loss": 0.0059, "step": 7600 }, { "epoch": 1.2362637362637363, "grad_norm": 0.060549281537532806, "learning_rate": 1.1764705882352942e-05, "loss": 0.0054, "step": 7650 }, { "epoch": 1.244343891402715, "grad_norm": 0.06345050781965256, "learning_rate": 1.1710838181426418e-05, "loss": 0.0057, "step": 7700 }, { "epoch": 1.2524240465416936, "grad_norm": 0.03186087682843208, "learning_rate": 1.1656970480499893e-05, "loss": 0.0055, "step": 7750 }, { "epoch": 1.2605042016806722, "grad_norm": 0.057463087141513824, "learning_rate": 1.160310277957337e-05, "loss": 0.0057, "step": 7800 }, { "epoch": 1.2685843568196509, "grad_norm": 0.04167677462100983, "learning_rate": 1.1549235078646843e-05, "loss": 0.0056, "step": 7850 }, { "epoch": 1.2766645119586295, "grad_norm": 0.04780175909399986, "learning_rate": 1.149536737772032e-05, "loss": 0.0056, "step": 7900 }, { "epoch": 1.2847446670976082, "grad_norm": 0.06335488706827164, "learning_rate": 1.1441499676793795e-05, "loss": 0.0055, "step": 7950 }, { "epoch": 1.292824822236587, "grad_norm": 0.04902550205588341, "learning_rate": 1.138763197586727e-05, "loss": 0.0062, "step": 8000 }, { "epoch": 1.3009049773755657, "grad_norm": 0.0339086689054966, "learning_rate": 1.1333764274940747e-05, "loss": 0.0049, "step": 8050 }, { "epoch": 1.3089851325145443, "grad_norm": 0.044165465980768204, "learning_rate": 1.1279896574014222e-05, "loss": 0.0058, "step": 8100 }, { "epoch": 1.317065287653523, "grad_norm": 0.037159230560064316, "learning_rate": 1.1226028873087698e-05, "loss": 0.0053, "step": 8150 }, { "epoch": 1.3251454427925016, "grad_norm": 0.03820132464170456, "learning_rate": 1.1172161172161174e-05, "loss": 0.0053, "step": 8200 }, { "epoch": 1.3332255979314802, "grad_norm": 0.024112703278660774, "learning_rate": 1.111829347123465e-05, "loss": 0.0054, "step": 8250 }, { "epoch": 1.341305753070459, "grad_norm": 0.029049362987279892, "learning_rate": 1.1064425770308126e-05, "loss": 0.005, "step": 8300 }, { "epoch": 1.3493859082094377, "grad_norm": 0.0295393168926239, "learning_rate": 1.1010558069381601e-05, "loss": 0.0053, "step": 8350 }, { "epoch": 1.3574660633484164, "grad_norm": 0.04328558221459389, "learning_rate": 1.0956690368455075e-05, "loss": 0.005, "step": 8400 }, { "epoch": 1.365546218487395, "grad_norm": 0.058262139558792114, "learning_rate": 1.090282266752855e-05, "loss": 0.0053, "step": 8450 }, { "epoch": 1.3736263736263736, "grad_norm": 0.033369556069374084, "learning_rate": 1.0848954966602025e-05, "loss": 0.0058, "step": 8500 }, { "epoch": 1.3817065287653523, "grad_norm": 0.07817134261131287, "learning_rate": 1.0795087265675501e-05, "loss": 0.0053, "step": 8550 }, { "epoch": 1.389786683904331, "grad_norm": 0.0364365428686142, "learning_rate": 1.0741219564748977e-05, "loss": 0.0049, "step": 8600 }, { "epoch": 1.3978668390433096, "grad_norm": 0.047975849360227585, "learning_rate": 1.0687351863822453e-05, "loss": 0.0054, "step": 8650 }, { "epoch": 1.4059469941822882, "grad_norm": 0.05053960904479027, "learning_rate": 1.0633484162895929e-05, "loss": 0.0049, "step": 8700 }, { "epoch": 1.4140271493212668, "grad_norm": 0.05517440661787987, "learning_rate": 1.0579616461969404e-05, "loss": 0.0053, "step": 8750 }, { "epoch": 1.4221073044602457, "grad_norm": 0.045634444802999496, "learning_rate": 1.052574876104288e-05, "loss": 0.0051, "step": 8800 }, { "epoch": 1.4301874595992243, "grad_norm": 0.06119120493531227, "learning_rate": 1.0471881060116356e-05, "loss": 0.005, "step": 8850 }, { "epoch": 1.438267614738203, "grad_norm": 0.037451136857271194, "learning_rate": 1.0418013359189832e-05, "loss": 0.005, "step": 8900 }, { "epoch": 1.4463477698771816, "grad_norm": 0.045001059770584106, "learning_rate": 1.0364145658263308e-05, "loss": 0.0055, "step": 8950 }, { "epoch": 1.4544279250161603, "grad_norm": 0.027605222538113594, "learning_rate": 1.031027795733678e-05, "loss": 0.0047, "step": 9000 }, { "epoch": 1.4625080801551391, "grad_norm": 0.04454193264245987, "learning_rate": 1.0256410256410256e-05, "loss": 0.0054, "step": 9050 }, { "epoch": 1.4705882352941178, "grad_norm": 0.023829152807593346, "learning_rate": 1.0202542555483732e-05, "loss": 0.0051, "step": 9100 }, { "epoch": 1.4786683904330964, "grad_norm": 0.04157187044620514, "learning_rate": 1.0148674854557207e-05, "loss": 0.0047, "step": 9150 }, { "epoch": 1.486748545572075, "grad_norm": 0.03874518349766731, "learning_rate": 1.0094807153630683e-05, "loss": 0.0053, "step": 9200 }, { "epoch": 1.4948287007110537, "grad_norm": 0.026905696839094162, "learning_rate": 1.0040939452704159e-05, "loss": 0.005, "step": 9250 }, { "epoch": 1.5029088558500323, "grad_norm": 0.06382673233747482, "learning_rate": 9.987071751777635e-06, "loss": 0.005, "step": 9300 }, { "epoch": 1.510989010989011, "grad_norm": 0.02449849620461464, "learning_rate": 9.93320405085111e-06, "loss": 0.0053, "step": 9350 }, { "epoch": 1.5190691661279896, "grad_norm": 0.04602253809571266, "learning_rate": 9.879336349924586e-06, "loss": 0.0058, "step": 9400 }, { "epoch": 1.5271493212669682, "grad_norm": 0.026318348944187164, "learning_rate": 9.82546864899806e-06, "loss": 0.005, "step": 9450 }, { "epoch": 1.5352294764059469, "grad_norm": 0.028364863246679306, "learning_rate": 9.771600948071536e-06, "loss": 0.0047, "step": 9500 }, { "epoch": 1.5433096315449255, "grad_norm": 0.03204864263534546, "learning_rate": 9.717733247145012e-06, "loss": 0.0046, "step": 9550 }, { "epoch": 1.5513897866839044, "grad_norm": 0.023740077391266823, "learning_rate": 9.663865546218488e-06, "loss": 0.0057, "step": 9600 }, { "epoch": 1.559469941822883, "grad_norm": 0.028953751549124718, "learning_rate": 9.609997845291964e-06, "loss": 0.0049, "step": 9650 }, { "epoch": 1.5675500969618616, "grad_norm": 0.039458662271499634, "learning_rate": 9.55613014436544e-06, "loss": 0.0052, "step": 9700 }, { "epoch": 1.5756302521008403, "grad_norm": 0.02466548979282379, "learning_rate": 9.502262443438914e-06, "loss": 0.0053, "step": 9750 }, { "epoch": 1.5837104072398192, "grad_norm": 0.09884428232908249, "learning_rate": 9.44839474251239e-06, "loss": 0.0045, "step": 9800 }, { "epoch": 1.5917905623787978, "grad_norm": 0.052319470793008804, "learning_rate": 9.394527041585865e-06, "loss": 0.0047, "step": 9850 }, { "epoch": 1.5998707175177764, "grad_norm": 0.04534400627017021, "learning_rate": 9.340659340659341e-06, "loss": 0.0042, "step": 9900 }, { "epoch": 1.607950872656755, "grad_norm": 0.031061217188835144, "learning_rate": 9.286791639732817e-06, "loss": 0.0053, "step": 9950 }, { "epoch": 1.6160310277957337, "grad_norm": 0.04152417555451393, "learning_rate": 9.232923938806293e-06, "loss": 0.0044, "step": 10000 }, { "epoch": 1.6241111829347123, "grad_norm": 0.06364952027797699, "learning_rate": 9.179056237879767e-06, "loss": 0.0051, "step": 10050 }, { "epoch": 1.632191338073691, "grad_norm": 0.05075689032673836, "learning_rate": 9.125188536953243e-06, "loss": 0.005, "step": 10100 }, { "epoch": 1.6402714932126696, "grad_norm": 0.05718667805194855, "learning_rate": 9.071320836026718e-06, "loss": 0.005, "step": 10150 }, { "epoch": 1.6483516483516483, "grad_norm": 0.05176397040486336, "learning_rate": 9.017453135100194e-06, "loss": 0.005, "step": 10200 }, { "epoch": 1.656431803490627, "grad_norm": 0.05737081915140152, "learning_rate": 8.96358543417367e-06, "loss": 0.0054, "step": 10250 }, { "epoch": 1.6645119586296055, "grad_norm": 0.02792266197502613, "learning_rate": 8.909717733247146e-06, "loss": 0.0044, "step": 10300 }, { "epoch": 1.6725921137685844, "grad_norm": 0.029061343520879745, "learning_rate": 8.855850032320622e-06, "loss": 0.0052, "step": 10350 }, { "epoch": 1.680672268907563, "grad_norm": 0.03089820221066475, "learning_rate": 8.801982331394097e-06, "loss": 0.005, "step": 10400 }, { "epoch": 1.6887524240465417, "grad_norm": 0.046603500843048096, "learning_rate": 8.748114630467572e-06, "loss": 0.006, "step": 10450 }, { "epoch": 1.6968325791855203, "grad_norm": 0.033478520810604095, "learning_rate": 8.694246929541047e-06, "loss": 0.0045, "step": 10500 }, { "epoch": 1.7049127343244992, "grad_norm": 0.045349959284067154, "learning_rate": 8.640379228614523e-06, "loss": 0.0043, "step": 10550 }, { "epoch": 1.7129928894634778, "grad_norm": 0.04198373854160309, "learning_rate": 8.586511527687999e-06, "loss": 0.0046, "step": 10600 }, { "epoch": 1.7210730446024565, "grad_norm": 0.030365299433469772, "learning_rate": 8.532643826761475e-06, "loss": 0.0047, "step": 10650 }, { "epoch": 1.729153199741435, "grad_norm": 0.053815387189388275, "learning_rate": 8.47877612583495e-06, "loss": 0.0053, "step": 10700 }, { "epoch": 1.7372333548804137, "grad_norm": 0.040949735790491104, "learning_rate": 8.424908424908426e-06, "loss": 0.0049, "step": 10750 }, { "epoch": 1.7453135100193924, "grad_norm": 0.02742874063551426, "learning_rate": 8.371040723981902e-06, "loss": 0.0049, "step": 10800 }, { "epoch": 1.753393665158371, "grad_norm": 0.06784753501415253, "learning_rate": 8.317173023055376e-06, "loss": 0.0058, "step": 10850 }, { "epoch": 1.7614738202973497, "grad_norm": 0.03332887962460518, "learning_rate": 8.263305322128852e-06, "loss": 0.0046, "step": 10900 }, { "epoch": 1.7695539754363283, "grad_norm": 0.031116908416152, "learning_rate": 8.209437621202328e-06, "loss": 0.0046, "step": 10950 }, { "epoch": 1.777634130575307, "grad_norm": 0.03276669979095459, "learning_rate": 8.155569920275804e-06, "loss": 0.0045, "step": 11000 }, { "epoch": 1.7857142857142856, "grad_norm": 0.04121123626828194, "learning_rate": 8.10170221934928e-06, "loss": 0.0057, "step": 11050 }, { "epoch": 1.7937944408532642, "grad_norm": 0.05228004604578018, "learning_rate": 8.047834518422755e-06, "loss": 0.005, "step": 11100 }, { "epoch": 1.801874595992243, "grad_norm": 0.03658825531601906, "learning_rate": 7.993966817496231e-06, "loss": 0.0048, "step": 11150 }, { "epoch": 1.8099547511312217, "grad_norm": 0.029890988022089005, "learning_rate": 7.940099116569705e-06, "loss": 0.0045, "step": 11200 }, { "epoch": 1.8180349062702004, "grad_norm": 0.057183150202035904, "learning_rate": 7.886231415643181e-06, "loss": 0.0046, "step": 11250 }, { "epoch": 1.826115061409179, "grad_norm": 0.032826654613018036, "learning_rate": 7.832363714716657e-06, "loss": 0.0044, "step": 11300 }, { "epoch": 1.8341952165481579, "grad_norm": 0.08467372506856918, "learning_rate": 7.778496013790133e-06, "loss": 0.0048, "step": 11350 }, { "epoch": 1.8422753716871365, "grad_norm": 0.03478661924600601, "learning_rate": 7.724628312863608e-06, "loss": 0.005, "step": 11400 }, { "epoch": 1.8503555268261151, "grad_norm": 0.05386711657047272, "learning_rate": 7.670760611937084e-06, "loss": 0.0048, "step": 11450 }, { "epoch": 1.8584356819650938, "grad_norm": 0.03354140743613243, "learning_rate": 7.616892911010558e-06, "loss": 0.0044, "step": 11500 }, { "epoch": 1.8665158371040724, "grad_norm": 0.043414924293756485, "learning_rate": 7.563025210084034e-06, "loss": 0.0055, "step": 11550 }, { "epoch": 1.874595992243051, "grad_norm": 0.040471382439136505, "learning_rate": 7.50915750915751e-06, "loss": 0.0052, "step": 11600 }, { "epoch": 1.8826761473820297, "grad_norm": 0.06200335547327995, "learning_rate": 7.455289808230986e-06, "loss": 0.0053, "step": 11650 }, { "epoch": 1.8907563025210083, "grad_norm": 0.04366638883948326, "learning_rate": 7.4014221073044616e-06, "loss": 0.0046, "step": 11700 }, { "epoch": 1.898836457659987, "grad_norm": 0.022821014747023582, "learning_rate": 7.3475544063779365e-06, "loss": 0.0045, "step": 11750 }, { "epoch": 1.9069166127989656, "grad_norm": 0.038873497396707535, "learning_rate": 7.2936867054514115e-06, "loss": 0.0046, "step": 11800 }, { "epoch": 1.9149967679379443, "grad_norm": 0.04890233278274536, "learning_rate": 7.239819004524887e-06, "loss": 0.0057, "step": 11850 }, { "epoch": 1.9230769230769231, "grad_norm": 0.04092305526137352, "learning_rate": 7.185951303598363e-06, "loss": 0.005, "step": 11900 }, { "epoch": 1.9311570782159018, "grad_norm": 0.05939971283078194, "learning_rate": 7.132083602671839e-06, "loss": 0.0046, "step": 11950 }, { "epoch": 1.9392372333548804, "grad_norm": 0.037901926785707474, "learning_rate": 7.078215901745315e-06, "loss": 0.0045, "step": 12000 }, { "epoch": 1.947317388493859, "grad_norm": 0.0488058477640152, "learning_rate": 7.024348200818789e-06, "loss": 0.005, "step": 12050 }, { "epoch": 1.955397543632838, "grad_norm": 0.06671205163002014, "learning_rate": 6.970480499892265e-06, "loss": 0.0053, "step": 12100 }, { "epoch": 1.9634776987718165, "grad_norm": 0.03580876812338829, "learning_rate": 6.91661279896574e-06, "loss": 0.0048, "step": 12150 }, { "epoch": 1.9715578539107952, "grad_norm": 0.05258365720510483, "learning_rate": 6.862745098039216e-06, "loss": 0.0047, "step": 12200 }, { "epoch": 1.9796380090497738, "grad_norm": 0.035823095589876175, "learning_rate": 6.808877397112692e-06, "loss": 0.0053, "step": 12250 }, { "epoch": 1.9877181641887525, "grad_norm": 0.03747252747416496, "learning_rate": 6.755009696186168e-06, "loss": 0.0052, "step": 12300 }, { "epoch": 1.995798319327731, "grad_norm": 0.029116889461874962, "learning_rate": 6.701141995259643e-06, "loss": 0.0045, "step": 12350 }, { "epoch": 2.0, "eval_loss": 0.003754823002964258, "eval_rouge1": 87.21845608272304, "eval_rouge2": 83.41556886218162, "eval_rougeL": 85.9862609239223, "eval_rougeLsum": 85.95382052592191, "eval_runtime": 48.1802, "eval_samples_per_second": 10.378, "eval_steps_per_second": 1.308, "step": 12376 }, { "epoch": 2.0038784744667097, "grad_norm": 0.040494147688150406, "learning_rate": 6.647274294333118e-06, "loss": 0.0051, "step": 12400 }, { "epoch": 2.0119586296056884, "grad_norm": 0.021118978038430214, "learning_rate": 6.5934065934065935e-06, "loss": 0.0044, "step": 12450 }, { "epoch": 2.020038784744667, "grad_norm": 0.03655942156910896, "learning_rate": 6.539538892480069e-06, "loss": 0.0046, "step": 12500 }, { "epoch": 2.0281189398836457, "grad_norm": 0.033596962690353394, "learning_rate": 6.485671191553545e-06, "loss": 0.0049, "step": 12550 }, { "epoch": 2.0361990950226243, "grad_norm": 0.04082285612821579, "learning_rate": 6.431803490627021e-06, "loss": 0.0046, "step": 12600 }, { "epoch": 2.044279250161603, "grad_norm": 0.035328153520822525, "learning_rate": 6.377935789700496e-06, "loss": 0.0047, "step": 12650 }, { "epoch": 2.0523594053005816, "grad_norm": 0.06532145291566849, "learning_rate": 6.324068088773972e-06, "loss": 0.0046, "step": 12700 }, { "epoch": 2.0604395604395602, "grad_norm": 0.021467560902237892, "learning_rate": 6.2702003878474475e-06, "loss": 0.0044, "step": 12750 }, { "epoch": 2.0685197155785393, "grad_norm": 0.028903750702738762, "learning_rate": 6.2163326869209225e-06, "loss": 0.0047, "step": 12800 }, { "epoch": 2.076599870717518, "grad_norm": 0.04827199503779411, "learning_rate": 6.162464985994398e-06, "loss": 0.0046, "step": 12850 }, { "epoch": 2.0846800258564966, "grad_norm": 0.05190654471516609, "learning_rate": 6.108597285067874e-06, "loss": 0.0044, "step": 12900 }, { "epoch": 2.0927601809954752, "grad_norm": 0.043597374111413956, "learning_rate": 6.054729584141349e-06, "loss": 0.0039, "step": 12950 }, { "epoch": 2.100840336134454, "grad_norm": 0.025249801576137543, "learning_rate": 6.000861883214825e-06, "loss": 0.0049, "step": 13000 }, { "epoch": 2.1089204912734325, "grad_norm": 0.04429004713892937, "learning_rate": 5.946994182288301e-06, "loss": 0.0045, "step": 13050 }, { "epoch": 2.117000646412411, "grad_norm": 0.05018601194024086, "learning_rate": 5.8931264813617764e-06, "loss": 0.0047, "step": 13100 }, { "epoch": 2.12508080155139, "grad_norm": 0.0563606433570385, "learning_rate": 5.839258780435252e-06, "loss": 0.0049, "step": 13150 }, { "epoch": 2.1331609566903684, "grad_norm": 0.036379583179950714, "learning_rate": 5.785391079508727e-06, "loss": 0.0045, "step": 13200 }, { "epoch": 2.141241111829347, "grad_norm": 0.03966435417532921, "learning_rate": 5.731523378582202e-06, "loss": 0.0046, "step": 13250 }, { "epoch": 2.1493212669683257, "grad_norm": 0.03525066375732422, "learning_rate": 5.677655677655678e-06, "loss": 0.004, "step": 13300 }, { "epoch": 2.1574014221073043, "grad_norm": 0.04028931260108948, "learning_rate": 5.623787976729154e-06, "loss": 0.0045, "step": 13350 }, { "epoch": 2.165481577246283, "grad_norm": 0.027697792276740074, "learning_rate": 5.5699202758026296e-06, "loss": 0.0049, "step": 13400 }, { "epoch": 2.1735617323852616, "grad_norm": 0.02208828553557396, "learning_rate": 5.516052574876105e-06, "loss": 0.0044, "step": 13450 }, { "epoch": 2.1816418875242403, "grad_norm": 0.03327793627977371, "learning_rate": 5.4621848739495795e-06, "loss": 0.0052, "step": 13500 }, { "epoch": 2.1897220426632193, "grad_norm": 0.040238089859485626, "learning_rate": 5.408317173023055e-06, "loss": 0.0044, "step": 13550 }, { "epoch": 2.197802197802198, "grad_norm": 0.02015913464128971, "learning_rate": 5.354449472096531e-06, "loss": 0.0049, "step": 13600 }, { "epoch": 2.2058823529411766, "grad_norm": 0.018138017505407333, "learning_rate": 5.300581771170007e-06, "loss": 0.0048, "step": 13650 }, { "epoch": 2.2139625080801553, "grad_norm": 0.050266802310943604, "learning_rate": 5.246714070243483e-06, "loss": 0.0043, "step": 13700 }, { "epoch": 2.222042663219134, "grad_norm": 0.04515570029616356, "learning_rate": 5.1928463693169585e-06, "loss": 0.0036, "step": 13750 }, { "epoch": 2.2301228183581125, "grad_norm": 0.039447471499443054, "learning_rate": 5.138978668390433e-06, "loss": 0.0044, "step": 13800 }, { "epoch": 2.238202973497091, "grad_norm": 0.03310653567314148, "learning_rate": 5.085110967463908e-06, "loss": 0.0044, "step": 13850 }, { "epoch": 2.24628312863607, "grad_norm": 0.014222861267626286, "learning_rate": 5.031243266537384e-06, "loss": 0.0047, "step": 13900 }, { "epoch": 2.2543632837750485, "grad_norm": 0.016546355560421944, "learning_rate": 4.97737556561086e-06, "loss": 0.0047, "step": 13950 }, { "epoch": 2.262443438914027, "grad_norm": 0.05192546918988228, "learning_rate": 4.923507864684336e-06, "loss": 0.0043, "step": 14000 }, { "epoch": 2.2705235940530057, "grad_norm": 0.04964493215084076, "learning_rate": 4.869640163757811e-06, "loss": 0.0047, "step": 14050 }, { "epoch": 2.2786037491919844, "grad_norm": 0.01690448634326458, "learning_rate": 4.8157724628312866e-06, "loss": 0.0043, "step": 14100 }, { "epoch": 2.286683904330963, "grad_norm": 0.03871385008096695, "learning_rate": 4.761904761904762e-06, "loss": 0.0054, "step": 14150 }, { "epoch": 2.2947640594699417, "grad_norm": 0.029631048440933228, "learning_rate": 4.708037060978238e-06, "loss": 0.0042, "step": 14200 }, { "epoch": 2.3028442146089203, "grad_norm": 0.031350765377283096, "learning_rate": 4.654169360051713e-06, "loss": 0.0045, "step": 14250 }, { "epoch": 2.310924369747899, "grad_norm": 0.05931168794631958, "learning_rate": 4.600301659125189e-06, "loss": 0.0051, "step": 14300 }, { "epoch": 2.3190045248868776, "grad_norm": 0.04504646733403206, "learning_rate": 4.546433958198665e-06, "loss": 0.0043, "step": 14350 }, { "epoch": 2.3270846800258567, "grad_norm": 0.04591371491551399, "learning_rate": 4.4925662572721405e-06, "loss": 0.0046, "step": 14400 }, { "epoch": 2.3351648351648353, "grad_norm": 0.030743196606636047, "learning_rate": 4.4386985563456155e-06, "loss": 0.004, "step": 14450 }, { "epoch": 2.343244990303814, "grad_norm": 0.04723818972706795, "learning_rate": 4.384830855419091e-06, "loss": 0.0036, "step": 14500 }, { "epoch": 2.3513251454427926, "grad_norm": 0.02913173846900463, "learning_rate": 4.330963154492567e-06, "loss": 0.0049, "step": 14550 }, { "epoch": 2.3594053005817712, "grad_norm": 0.025342822074890137, "learning_rate": 4.277095453566042e-06, "loss": 0.0052, "step": 14600 }, { "epoch": 2.36748545572075, "grad_norm": 0.04897177964448929, "learning_rate": 4.223227752639518e-06, "loss": 0.0046, "step": 14650 }, { "epoch": 2.3755656108597285, "grad_norm": 0.05571192875504494, "learning_rate": 4.169360051712994e-06, "loss": 0.0045, "step": 14700 }, { "epoch": 2.383645765998707, "grad_norm": 0.04304831102490425, "learning_rate": 4.115492350786469e-06, "loss": 0.0045, "step": 14750 }, { "epoch": 2.391725921137686, "grad_norm": 0.04351252317428589, "learning_rate": 4.0616246498599444e-06, "loss": 0.0046, "step": 14800 }, { "epoch": 2.3998060762766644, "grad_norm": 0.07481127232313156, "learning_rate": 4.00775694893342e-06, "loss": 0.0044, "step": 14850 }, { "epoch": 2.407886231415643, "grad_norm": 0.06555662304162979, "learning_rate": 3.953889248006895e-06, "loss": 0.0046, "step": 14900 }, { "epoch": 2.4159663865546217, "grad_norm": 0.03455435112118721, "learning_rate": 3.900021547080371e-06, "loss": 0.0047, "step": 14950 }, { "epoch": 2.4240465416936003, "grad_norm": 0.0476142056286335, "learning_rate": 3.846153846153847e-06, "loss": 0.0046, "step": 15000 }, { "epoch": 2.4321266968325794, "grad_norm": 0.07432708144187927, "learning_rate": 3.7922861452273218e-06, "loss": 0.0044, "step": 15050 }, { "epoch": 2.440206851971558, "grad_norm": 0.018862377852201462, "learning_rate": 3.7384184443007976e-06, "loss": 0.0042, "step": 15100 }, { "epoch": 2.4482870071105367, "grad_norm": 0.0463634617626667, "learning_rate": 3.6845507433742734e-06, "loss": 0.0051, "step": 15150 }, { "epoch": 2.4563671622495153, "grad_norm": 0.0470375195145607, "learning_rate": 3.6306830424477483e-06, "loss": 0.0046, "step": 15200 }, { "epoch": 2.464447317388494, "grad_norm": 0.028038673102855682, "learning_rate": 3.576815341521224e-06, "loss": 0.0045, "step": 15250 }, { "epoch": 2.4725274725274726, "grad_norm": 0.03303570672869682, "learning_rate": 3.5229476405947e-06, "loss": 0.0048, "step": 15300 }, { "epoch": 2.4806076276664513, "grad_norm": 0.040639448910951614, "learning_rate": 3.4690799396681753e-06, "loss": 0.0045, "step": 15350 }, { "epoch": 2.48868778280543, "grad_norm": 0.03696522116661072, "learning_rate": 3.4152122387416507e-06, "loss": 0.0047, "step": 15400 }, { "epoch": 2.4967679379444085, "grad_norm": 0.06647258996963501, "learning_rate": 3.3613445378151265e-06, "loss": 0.0046, "step": 15450 }, { "epoch": 2.504848093083387, "grad_norm": 0.039001937955617905, "learning_rate": 3.307476836888602e-06, "loss": 0.0052, "step": 15500 }, { "epoch": 2.512928248222366, "grad_norm": 0.02684204652905464, "learning_rate": 3.2536091359620777e-06, "loss": 0.0043, "step": 15550 }, { "epoch": 2.5210084033613445, "grad_norm": 0.04314061999320984, "learning_rate": 3.1997414350355526e-06, "loss": 0.0047, "step": 15600 }, { "epoch": 2.529088558500323, "grad_norm": 0.04125179722905159, "learning_rate": 3.1458737341090284e-06, "loss": 0.0046, "step": 15650 }, { "epoch": 2.5371687136393017, "grad_norm": 0.01851014606654644, "learning_rate": 3.0920060331825042e-06, "loss": 0.0045, "step": 15700 }, { "epoch": 2.5452488687782804, "grad_norm": 0.03928118199110031, "learning_rate": 3.038138332255979e-06, "loss": 0.0043, "step": 15750 }, { "epoch": 2.553329023917259, "grad_norm": 0.0691729187965393, "learning_rate": 2.984270631329455e-06, "loss": 0.005, "step": 15800 }, { "epoch": 2.5614091790562377, "grad_norm": 0.016138378530740738, "learning_rate": 2.930402930402931e-06, "loss": 0.0041, "step": 15850 }, { "epoch": 2.5694893341952163, "grad_norm": 0.020654356107115746, "learning_rate": 2.8765352294764057e-06, "loss": 0.004, "step": 15900 }, { "epoch": 2.577569489334195, "grad_norm": 0.04678305238485336, "learning_rate": 2.8226675285498815e-06, "loss": 0.0045, "step": 15950 }, { "epoch": 2.585649644473174, "grad_norm": 0.035098664462566376, "learning_rate": 2.7687998276233574e-06, "loss": 0.0049, "step": 16000 }, { "epoch": 2.5937297996121527, "grad_norm": 0.02151743695139885, "learning_rate": 2.7149321266968327e-06, "loss": 0.0049, "step": 16050 }, { "epoch": 2.6018099547511313, "grad_norm": 0.008306082338094711, "learning_rate": 2.6610644257703085e-06, "loss": 0.0043, "step": 16100 }, { "epoch": 2.60989010989011, "grad_norm": 0.03021089918911457, "learning_rate": 2.607196724843784e-06, "loss": 0.0042, "step": 16150 }, { "epoch": 2.6179702650290886, "grad_norm": 0.032146211713552475, "learning_rate": 2.5533290239172593e-06, "loss": 0.0041, "step": 16200 }, { "epoch": 2.6260504201680672, "grad_norm": 0.030957765877246857, "learning_rate": 2.499461322990735e-06, "loss": 0.0041, "step": 16250 }, { "epoch": 2.634130575307046, "grad_norm": 0.04607204720377922, "learning_rate": 2.4455936220642105e-06, "loss": 0.0051, "step": 16300 }, { "epoch": 2.6422107304460245, "grad_norm": 0.02674623765051365, "learning_rate": 2.3917259211376863e-06, "loss": 0.0044, "step": 16350 }, { "epoch": 2.650290885585003, "grad_norm": 0.016672629863023758, "learning_rate": 2.3378582202111617e-06, "loss": 0.0036, "step": 16400 }, { "epoch": 2.658371040723982, "grad_norm": 0.03345870599150658, "learning_rate": 2.283990519284637e-06, "loss": 0.0042, "step": 16450 }, { "epoch": 2.6664511958629604, "grad_norm": 0.03514959663152695, "learning_rate": 2.230122818358113e-06, "loss": 0.0049, "step": 16500 }, { "epoch": 2.6745313510019395, "grad_norm": 0.03409695625305176, "learning_rate": 2.1762551174315882e-06, "loss": 0.0044, "step": 16550 }, { "epoch": 2.682611506140918, "grad_norm": 0.021505704149603844, "learning_rate": 2.1223874165050636e-06, "loss": 0.0039, "step": 16600 }, { "epoch": 2.690691661279897, "grad_norm": 0.142180934548378, "learning_rate": 2.068519715578539e-06, "loss": 0.0049, "step": 16650 }, { "epoch": 2.6987718164188754, "grad_norm": 0.05118989571928978, "learning_rate": 2.0146520146520148e-06, "loss": 0.0044, "step": 16700 }, { "epoch": 2.706851971557854, "grad_norm": 0.03433993086218834, "learning_rate": 1.96078431372549e-06, "loss": 0.0042, "step": 16750 }, { "epoch": 2.7149321266968327, "grad_norm": 0.05495297163724899, "learning_rate": 1.9069166127989658e-06, "loss": 0.0044, "step": 16800 }, { "epoch": 2.7230122818358113, "grad_norm": 0.03106830269098282, "learning_rate": 1.8530489118724416e-06, "loss": 0.0042, "step": 16850 }, { "epoch": 2.73109243697479, "grad_norm": 0.05960262566804886, "learning_rate": 1.799181210945917e-06, "loss": 0.004, "step": 16900 }, { "epoch": 2.7391725921137686, "grad_norm": 0.0477134995162487, "learning_rate": 1.7453135100193923e-06, "loss": 0.0045, "step": 16950 }, { "epoch": 2.7472527472527473, "grad_norm": 0.04596434533596039, "learning_rate": 1.6925231631113985e-06, "loss": 0.0051, "step": 17000 }, { "epoch": 2.755332902391726, "grad_norm": 0.02121753990650177, "learning_rate": 1.6386554621848741e-06, "loss": 0.0046, "step": 17050 }, { "epoch": 2.7634130575307045, "grad_norm": 0.04902210831642151, "learning_rate": 1.5847877612583495e-06, "loss": 0.005, "step": 17100 }, { "epoch": 2.771493212669683, "grad_norm": 0.03566383942961693, "learning_rate": 1.5309200603318253e-06, "loss": 0.0039, "step": 17150 }, { "epoch": 2.779573367808662, "grad_norm": 0.03547670692205429, "learning_rate": 1.4770523594053007e-06, "loss": 0.0045, "step": 17200 }, { "epoch": 2.7876535229476405, "grad_norm": 0.03021397814154625, "learning_rate": 1.423184658478776e-06, "loss": 0.0043, "step": 17250 }, { "epoch": 2.795733678086619, "grad_norm": 0.03953844681382179, "learning_rate": 1.3693169575522519e-06, "loss": 0.0039, "step": 17300 }, { "epoch": 2.8038138332255977, "grad_norm": 0.03536642715334892, "learning_rate": 1.3154492566257273e-06, "loss": 0.0041, "step": 17350 }, { "epoch": 2.8118939883645764, "grad_norm": 0.03634963929653168, "learning_rate": 1.2615815556992029e-06, "loss": 0.005, "step": 17400 }, { "epoch": 2.819974143503555, "grad_norm": 0.032898109406232834, "learning_rate": 1.2077138547726784e-06, "loss": 0.004, "step": 17450 }, { "epoch": 2.8280542986425337, "grad_norm": 0.05191650986671448, "learning_rate": 1.153846153846154e-06, "loss": 0.0046, "step": 17500 }, { "epoch": 2.8361344537815127, "grad_norm": 0.02623840421438217, "learning_rate": 1.0999784529196294e-06, "loss": 0.0042, "step": 17550 }, { "epoch": 2.8442146089204914, "grad_norm": 0.05044868588447571, "learning_rate": 1.046110751993105e-06, "loss": 0.0044, "step": 17600 }, { "epoch": 2.85229476405947, "grad_norm": 0.12745575606822968, "learning_rate": 9.922430510665806e-07, "loss": 0.0046, "step": 17650 }, { "epoch": 2.8603749191984487, "grad_norm": 0.04590355604887009, "learning_rate": 9.383753501400561e-07, "loss": 0.0041, "step": 17700 }, { "epoch": 2.8684550743374273, "grad_norm": 0.027806632220745087, "learning_rate": 8.845076492135317e-07, "loss": 0.0049, "step": 17750 }, { "epoch": 2.876535229476406, "grad_norm": 0.047230854630470276, "learning_rate": 8.306399482870072e-07, "loss": 0.0044, "step": 17800 }, { "epoch": 2.8846153846153846, "grad_norm": 0.023929964751005173, "learning_rate": 7.767722473604828e-07, "loss": 0.0041, "step": 17850 }, { "epoch": 2.892695539754363, "grad_norm": 0.01972135528922081, "learning_rate": 7.229045464339583e-07, "loss": 0.0038, "step": 17900 }, { "epoch": 2.900775694893342, "grad_norm": 0.06653473526239395, "learning_rate": 6.690368455074338e-07, "loss": 0.0043, "step": 17950 }, { "epoch": 2.9088558500323205, "grad_norm": 0.03229772299528122, "learning_rate": 6.151691445809093e-07, "loss": 0.0045, "step": 18000 }, { "epoch": 2.916936005171299, "grad_norm": 0.027306850999593735, "learning_rate": 5.613014436543849e-07, "loss": 0.0044, "step": 18050 }, { "epoch": 2.9250161603102782, "grad_norm": 0.04258955270051956, "learning_rate": 5.074337427278604e-07, "loss": 0.0044, "step": 18100 }, { "epoch": 2.933096315449257, "grad_norm": 0.05304836109280586, "learning_rate": 4.5356604180133593e-07, "loss": 0.0042, "step": 18150 }, { "epoch": 2.9411764705882355, "grad_norm": 0.03534824028611183, "learning_rate": 3.9969834087481147e-07, "loss": 0.0044, "step": 18200 }, { "epoch": 2.949256625727214, "grad_norm": 0.04045730456709862, "learning_rate": 3.4583063994828706e-07, "loss": 0.0046, "step": 18250 }, { "epoch": 2.957336780866193, "grad_norm": 0.022856054827570915, "learning_rate": 2.919629390217626e-07, "loss": 0.0046, "step": 18300 }, { "epoch": 2.9654169360051714, "grad_norm": 0.04489414766430855, "learning_rate": 2.3809523809523811e-07, "loss": 0.0041, "step": 18350 }, { "epoch": 2.97349709114415, "grad_norm": 0.022563565522432327, "learning_rate": 1.8422753716871365e-07, "loss": 0.0045, "step": 18400 }, { "epoch": 2.9815772462831287, "grad_norm": 0.05544481799006462, "learning_rate": 1.303598362421892e-07, "loss": 0.0042, "step": 18450 }, { "epoch": 2.9896574014221073, "grad_norm": 0.03954526409506798, "learning_rate": 7.649213531566473e-08, "loss": 0.0045, "step": 18500 }, { "epoch": 2.997737556561086, "grad_norm": 0.10584747046232224, "learning_rate": 2.2624434389140274e-08, "loss": 0.0045, "step": 18550 } ], "logging_steps": 50, "max_steps": 18564, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.043024674816e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }