{
"best_metric": 4.381021022796631,
"best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/mix-bpe/checkpoint-3906",
"epoch": 0.9998557456507313,
"eval_steps": 434,
"global_step": 4332,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002308069588298087,
"grad_norm": 1.293995976448059,
"learning_rate": 1.4999999999999999e-05,
"loss": 8.3864,
"step": 10
},
{
"epoch": 0.004616139176596174,
"grad_norm": 1.6057629585266113,
"learning_rate": 2.9999999999999997e-05,
"loss": 8.3534,
"step": 20
},
{
"epoch": 0.006924208764894262,
"grad_norm": 1.8857852220535278,
"learning_rate": 4.4999999999999996e-05,
"loss": 8.2432,
"step": 30
},
{
"epoch": 0.009232278353192349,
"grad_norm": 1.6119285821914673,
"learning_rate": 5.9999999999999995e-05,
"loss": 8.0835,
"step": 40
},
{
"epoch": 0.011540347941490435,
"grad_norm": 1.474076747894287,
"learning_rate": 7.5e-05,
"loss": 7.8521,
"step": 50
},
{
"epoch": 0.013848417529788524,
"grad_norm": 1.3890337944030762,
"learning_rate": 8.999999999999999e-05,
"loss": 7.6704,
"step": 60
},
{
"epoch": 0.01615648711808661,
"grad_norm": 1.7293298244476318,
"learning_rate": 0.00010499999999999999,
"loss": 7.4797,
"step": 70
},
{
"epoch": 0.018464556706384697,
"grad_norm": 1.161170244216919,
"learning_rate": 0.00011999999999999999,
"loss": 7.2628,
"step": 80
},
{
"epoch": 0.020772626294682784,
"grad_norm": 1.2024428844451904,
"learning_rate": 0.000135,
"loss": 7.0311,
"step": 90
},
{
"epoch": 0.02308069588298087,
"grad_norm": 0.9382893443107605,
"learning_rate": 0.00015,
"loss": 6.79,
"step": 100
},
{
"epoch": 0.025388765471278957,
"grad_norm": 0.9623205661773682,
"learning_rate": 0.000165,
"loss": 6.6329,
"step": 110
},
{
"epoch": 0.027696835059577048,
"grad_norm": 0.5374640822410583,
"learning_rate": 0.00017999999999999998,
"loss": 6.4776,
"step": 120
},
{
"epoch": 0.030004904647875134,
"grad_norm": 0.37330469489097595,
"learning_rate": 0.000195,
"loss": 6.4591,
"step": 130
},
{
"epoch": 0.03231297423617322,
"grad_norm": 0.4222196340560913,
"learning_rate": 0.00020999999999999998,
"loss": 6.4463,
"step": 140
},
{
"epoch": 0.03462104382447131,
"grad_norm": 0.4244931638240814,
"learning_rate": 0.000225,
"loss": 6.4109,
"step": 150
},
{
"epoch": 0.036929113412769395,
"grad_norm": 0.5667627453804016,
"learning_rate": 0.00023999999999999998,
"loss": 6.3945,
"step": 160
},
{
"epoch": 0.03923718300106748,
"grad_norm": 0.4752316474914551,
"learning_rate": 0.00025499999999999996,
"loss": 6.3878,
"step": 170
},
{
"epoch": 0.04154525258936557,
"grad_norm": 0.5644646883010864,
"learning_rate": 0.00027,
"loss": 6.3093,
"step": 180
},
{
"epoch": 0.043853322177663655,
"grad_norm": 0.6428855657577515,
"learning_rate": 0.000285,
"loss": 6.3548,
"step": 190
},
{
"epoch": 0.04616139176596174,
"grad_norm": 0.8332350850105286,
"learning_rate": 0.0003,
"loss": 6.2497,
"step": 200
},
{
"epoch": 0.04846946135425983,
"grad_norm": 0.8160709142684937,
"learning_rate": 0.0002999956645089803,
"loss": 6.1748,
"step": 210
},
{
"epoch": 0.050777530942557915,
"grad_norm": 0.7665246725082397,
"learning_rate": 0.000299982658286541,
"loss": 6.1079,
"step": 220
},
{
"epoch": 0.05308560053085601,
"grad_norm": 0.7429030537605286,
"learning_rate": 0.00029996098208452687,
"loss": 6.032,
"step": 230
},
{
"epoch": 0.055393670119154095,
"grad_norm": 0.9278781414031982,
"learning_rate": 0.0002999306371559644,
"loss": 6.023,
"step": 240
},
{
"epoch": 0.05770173970745218,
"grad_norm": 0.7202680706977844,
"learning_rate": 0.00029989162525498905,
"loss": 5.9386,
"step": 250
},
{
"epoch": 0.06000980929575027,
"grad_norm": 0.7615482807159424,
"learning_rate": 0.000299843948636744,
"loss": 5.8888,
"step": 260
},
{
"epoch": 0.062317878884048356,
"grad_norm": 0.7727493643760681,
"learning_rate": 0.00029978761005725014,
"loss": 5.8483,
"step": 270
},
{
"epoch": 0.06462594847234644,
"grad_norm": 0.8060325980186462,
"learning_rate": 0.0002997226127732461,
"loss": 5.8168,
"step": 280
},
{
"epoch": 0.06693401806064453,
"grad_norm": 0.801364541053772,
"learning_rate": 0.0002996489605420004,
"loss": 5.7915,
"step": 290
},
{
"epoch": 0.06924208764894262,
"grad_norm": 0.6761994957923889,
"learning_rate": 0.0002995666576210942,
"loss": 5.816,
"step": 300
},
{
"epoch": 0.0715501572372407,
"grad_norm": 0.854761004447937,
"learning_rate": 0.0002994757087681753,
"loss": 5.7237,
"step": 310
},
{
"epoch": 0.07385822682553879,
"grad_norm": 0.810724675655365,
"learning_rate": 0.0002993761192406826,
"loss": 5.6849,
"step": 320
},
{
"epoch": 0.07616629641383688,
"grad_norm": 0.7817623615264893,
"learning_rate": 0.000299267894795543,
"loss": 5.6347,
"step": 330
},
{
"epoch": 0.07847436600213496,
"grad_norm": 0.7851743698120117,
"learning_rate": 0.0002991510416888378,
"loss": 5.5995,
"step": 340
},
{
"epoch": 0.08078243559043305,
"grad_norm": 0.736893355846405,
"learning_rate": 0.0002990255666754418,
"loss": 5.6445,
"step": 350
},
{
"epoch": 0.08309050517873114,
"grad_norm": 0.8936936855316162,
"learning_rate": 0.00029889147700863205,
"loss": 5.6018,
"step": 360
},
{
"epoch": 0.08539857476702922,
"grad_norm": 0.852159857749939,
"learning_rate": 0.00029874878043966926,
"loss": 5.5471,
"step": 370
},
{
"epoch": 0.08770664435532731,
"grad_norm": 0.7617043256759644,
"learning_rate": 0.0002985974852173493,
"loss": 5.5397,
"step": 380
},
{
"epoch": 0.0900147139436254,
"grad_norm": 0.7751661539077759,
"learning_rate": 0.0002984376000875267,
"loss": 5.4445,
"step": 390
},
{
"epoch": 0.09232278353192348,
"grad_norm": 0.8484746813774109,
"learning_rate": 0.00029826913429260843,
"loss": 5.4171,
"step": 400
},
{
"epoch": 0.09463085312022157,
"grad_norm": 0.885892391204834,
"learning_rate": 0.0002980920975710206,
"loss": 5.4414,
"step": 410
},
{
"epoch": 0.09693892270851966,
"grad_norm": 0.8213350176811218,
"learning_rate": 0.0002979065001566447,
"loss": 5.4237,
"step": 420
},
{
"epoch": 0.09924699229681774,
"grad_norm": 0.839364767074585,
"learning_rate": 0.00029771235277822633,
"loss": 5.4203,
"step": 430
},
{
"epoch": 0.10017022013213699,
"eval_loss": 5.607814311981201,
"eval_runtime": 38.9963,
"eval_samples_per_second": 641.087,
"eval_steps_per_second": 80.136,
"step": 434
},
{
"epoch": 0.10155506188511583,
"grad_norm": 0.8963534832000732,
"learning_rate": 0.0002975096666587551,
"loss": 5.3826,
"step": 440
},
{
"epoch": 0.10386313147341392,
"grad_norm": 0.9185658097267151,
"learning_rate": 0.0002972984535148157,
"loss": 5.3868,
"step": 450
},
{
"epoch": 0.10617120106171202,
"grad_norm": 0.9081022143363953,
"learning_rate": 0.0002970787255559106,
"loss": 5.4027,
"step": 460
},
{
"epoch": 0.1084792706500101,
"grad_norm": 0.8252591490745544,
"learning_rate": 0.00029685049548375426,
"loss": 5.3417,
"step": 470
},
{
"epoch": 0.11078734023830819,
"grad_norm": 0.8946505784988403,
"learning_rate": 0.0002966137764915393,
"loss": 5.2916,
"step": 480
},
{
"epoch": 0.11309540982660628,
"grad_norm": 0.8276567459106445,
"learning_rate": 0.00029636858226317304,
"loss": 5.2734,
"step": 490
},
{
"epoch": 0.11540347941490436,
"grad_norm": 0.8609122037887573,
"learning_rate": 0.00029611492697248726,
"loss": 5.3293,
"step": 500
},
{
"epoch": 0.11771154900320245,
"grad_norm": 0.9473890662193298,
"learning_rate": 0.0002958528252824184,
"loss": 5.3,
"step": 510
},
{
"epoch": 0.12001961859150054,
"grad_norm": 0.8542040586471558,
"learning_rate": 0.0002955822923441601,
"loss": 5.2721,
"step": 520
},
{
"epoch": 0.12232768817979862,
"grad_norm": 0.8030778169631958,
"learning_rate": 0.00029530334379628735,
"loss": 5.2909,
"step": 530
},
{
"epoch": 0.12463575776809671,
"grad_norm": 0.8569799065589905,
"learning_rate": 0.0002950159957638525,
"loss": 5.2491,
"step": 540
},
{
"epoch": 0.12694382735639478,
"grad_norm": 0.8883563280105591,
"learning_rate": 0.00029472026485745297,
"loss": 5.1771,
"step": 550
},
{
"epoch": 0.12925189694469288,
"grad_norm": 0.9277822971343994,
"learning_rate": 0.00029441616817227145,
"loss": 5.1982,
"step": 560
},
{
"epoch": 0.13155996653299096,
"grad_norm": 0.888548731803894,
"learning_rate": 0.0002941037232870871,
"loss": 5.2259,
"step": 570
},
{
"epoch": 0.13386803612128906,
"grad_norm": 0.8762586712837219,
"learning_rate": 0.00029378294826325993,
"loss": 5.2699,
"step": 580
},
{
"epoch": 0.13617610570958713,
"grad_norm": 1.1397993564605713,
"learning_rate": 0.0002934538616436863,
"loss": 5.2092,
"step": 590
},
{
"epoch": 0.13848417529788523,
"grad_norm": 0.8557673096656799,
"learning_rate": 0.0002931164824517275,
"loss": 5.2417,
"step": 600
},
{
"epoch": 0.14079224488618333,
"grad_norm": 0.8277891874313354,
"learning_rate": 0.00029277083019010945,
"loss": 5.16,
"step": 610
},
{
"epoch": 0.1431003144744814,
"grad_norm": 0.8815522193908691,
"learning_rate": 0.00029241692483979593,
"loss": 5.1709,
"step": 620
},
{
"epoch": 0.1454083840627795,
"grad_norm": 0.9206939339637756,
"learning_rate": 0.0002920547868588331,
"loss": 5.1484,
"step": 630
},
{
"epoch": 0.14771645365107758,
"grad_norm": 0.8649567365646362,
"learning_rate": 0.00029168443718116725,
"loss": 5.1292,
"step": 640
},
{
"epoch": 0.15002452323937568,
"grad_norm": 0.8811323642730713,
"learning_rate": 0.00029130589721543433,
"loss": 5.1518,
"step": 650
},
{
"epoch": 0.15233259282767375,
"grad_norm": 0.8579047918319702,
"learning_rate": 0.0002909191888437227,
"loss": 5.0781,
"step": 660
},
{
"epoch": 0.15464066241597185,
"grad_norm": 0.9432843923568726,
"learning_rate": 0.00029052433442030797,
"loss": 5.1191,
"step": 670
},
{
"epoch": 0.15694873200426993,
"grad_norm": 0.8163111805915833,
"learning_rate": 0.00029012135677036077,
"loss": 5.0546,
"step": 680
},
{
"epoch": 0.15925680159256803,
"grad_norm": 0.8901731967926025,
"learning_rate": 0.00028971027918862777,
"loss": 5.0731,
"step": 690
},
{
"epoch": 0.1615648711808661,
"grad_norm": 0.9626258015632629,
"learning_rate": 0.00028929112543808435,
"loss": 5.0357,
"step": 700
},
{
"epoch": 0.1638729407691642,
"grad_norm": 0.898350179195404,
"learning_rate": 0.0002888639197485614,
"loss": 5.1027,
"step": 710
},
{
"epoch": 0.16618101035746227,
"grad_norm": 0.9431145787239075,
"learning_rate": 0.00028842868681534486,
"loss": 4.9996,
"step": 720
},
{
"epoch": 0.16848907994576037,
"grad_norm": 0.8631531596183777,
"learning_rate": 0.0002879854517977475,
"loss": 5.0532,
"step": 730
},
{
"epoch": 0.17079714953405845,
"grad_norm": 0.9095075726509094,
"learning_rate": 0.0002875342403176553,
"loss": 5.0117,
"step": 740
},
{
"epoch": 0.17310521912235655,
"grad_norm": 0.8849276304244995,
"learning_rate": 0.00028707507845804575,
"loss": 4.997,
"step": 750
},
{
"epoch": 0.17541328871065462,
"grad_norm": 1.015526533126831,
"learning_rate": 0.00028660799276148053,
"loss": 5.017,
"step": 760
},
{
"epoch": 0.17772135829895272,
"grad_norm": 0.9242818355560303,
"learning_rate": 0.00028613301022857086,
"loss": 5.0325,
"step": 770
},
{
"epoch": 0.1800294278872508,
"grad_norm": 0.871295154094696,
"learning_rate": 0.0002856501583164168,
"loss": 5.0235,
"step": 780
},
{
"epoch": 0.1823374974755489,
"grad_norm": 0.9293336868286133,
"learning_rate": 0.0002851594649370201,
"loss": 5.0311,
"step": 790
},
{
"epoch": 0.18464556706384697,
"grad_norm": 0.8775876760482788,
"learning_rate": 0.00028466095845567057,
"loss": 5.0069,
"step": 800
},
{
"epoch": 0.18695363665214507,
"grad_norm": 0.9968228340148926,
"learning_rate": 0.0002841546676893065,
"loss": 4.9595,
"step": 810
},
{
"epoch": 0.18926170624044314,
"grad_norm": 0.9142509698867798,
"learning_rate": 0.000283640621904849,
"loss": 4.9601,
"step": 820
},
{
"epoch": 0.19156977582874124,
"grad_norm": 0.863057017326355,
"learning_rate": 0.0002831188508175096,
"loss": 4.9811,
"step": 830
},
{
"epoch": 0.1938778454170393,
"grad_norm": 0.8651947379112244,
"learning_rate": 0.00028258938458907334,
"loss": 4.9288,
"step": 840
},
{
"epoch": 0.1961859150053374,
"grad_norm": 0.8541522026062012,
"learning_rate": 0.0002820522538261545,
"loss": 4.9037,
"step": 850
},
{
"epoch": 0.1984939845936355,
"grad_norm": 0.9082189202308655,
"learning_rate": 0.0002815074895784278,
"loss": 4.8757,
"step": 860
},
{
"epoch": 0.20034044026427397,
"eval_loss": 5.134793758392334,
"eval_runtime": 39.0568,
"eval_samples_per_second": 640.093,
"eval_steps_per_second": 80.012,
"step": 868
},
{
"epoch": 0.2008020541819336,
"grad_norm": 0.8799365162849426,
"learning_rate": 0.0002809551233368332,
"loss": 4.942,
"step": 870
},
{
"epoch": 0.20311012377023166,
"grad_norm": 0.9927550554275513,
"learning_rate": 0.00028039518703175577,
"loss": 4.8919,
"step": 880
},
{
"epoch": 0.20541819335852976,
"grad_norm": 0.9068810343742371,
"learning_rate": 0.00027982771303117996,
"loss": 4.9315,
"step": 890
},
{
"epoch": 0.20772626294682783,
"grad_norm": 0.975292980670929,
"learning_rate": 0.000279252734138818,
"loss": 4.919,
"step": 900
},
{
"epoch": 0.21003433253512593,
"grad_norm": 0.8984159231185913,
"learning_rate": 0.0002786702835922144,
"loss": 4.8989,
"step": 910
},
{
"epoch": 0.21234240212342403,
"grad_norm": 0.9731834530830383,
"learning_rate": 0.0002780803950608239,
"loss": 4.8991,
"step": 920
},
{
"epoch": 0.2146504717117221,
"grad_norm": 0.932558536529541,
"learning_rate": 0.00027748310264406564,
"loss": 4.8866,
"step": 930
},
{
"epoch": 0.2169585413000202,
"grad_norm": 0.9369585514068604,
"learning_rate": 0.00027687844086935176,
"loss": 4.8829,
"step": 940
},
{
"epoch": 0.21926661088831828,
"grad_norm": 0.9317886829376221,
"learning_rate": 0.0002762664446900914,
"loss": 4.8399,
"step": 950
},
{
"epoch": 0.22157468047661638,
"grad_norm": 0.9692897796630859,
"learning_rate": 0.00027564714948367046,
"loss": 4.8339,
"step": 960
},
{
"epoch": 0.22388275006491445,
"grad_norm": 0.912087082862854,
"learning_rate": 0.0002750205910494064,
"loss": 4.8176,
"step": 970
},
{
"epoch": 0.22619081965321255,
"grad_norm": 0.9403560757637024,
"learning_rate": 0.00027438680560647877,
"loss": 4.8227,
"step": 980
},
{
"epoch": 0.22849888924151063,
"grad_norm": 0.9240433573722839,
"learning_rate": 0.0002737458297918355,
"loss": 4.8053,
"step": 990
},
{
"epoch": 0.23080695882980873,
"grad_norm": 0.9296719431877136,
"learning_rate": 0.000273097700658075,
"loss": 4.8571,
"step": 1000
},
{
"epoch": 0.2331150284181068,
"grad_norm": 0.9312313199043274,
"learning_rate": 0.0002724424556713046,
"loss": 4.8177,
"step": 1010
},
{
"epoch": 0.2354230980064049,
"grad_norm": 0.9134889245033264,
"learning_rate": 0.0002717801327089743,
"loss": 4.7824,
"step": 1020
},
{
"epoch": 0.23773116759470297,
"grad_norm": 0.8830430507659912,
"learning_rate": 0.0002711107700576875,
"loss": 4.7996,
"step": 1030
},
{
"epoch": 0.24003923718300108,
"grad_norm": 0.9481594562530518,
"learning_rate": 0.00027043440641098777,
"loss": 4.8118,
"step": 1040
},
{
"epoch": 0.24234730677129915,
"grad_norm": 0.9204795956611633,
"learning_rate": 0.0002697510808671219,
"loss": 4.7847,
"step": 1050
},
{
"epoch": 0.24465537635959725,
"grad_norm": 0.9537863731384277,
"learning_rate": 0.0002690608329267801,
"loss": 4.79,
"step": 1060
},
{
"epoch": 0.24696344594789532,
"grad_norm": 0.9290711283683777,
"learning_rate": 0.00026836370249081235,
"loss": 4.7671,
"step": 1070
},
{
"epoch": 0.24927151553619342,
"grad_norm": 0.9401620626449585,
"learning_rate": 0.00026765972985792183,
"loss": 4.7715,
"step": 1080
},
{
"epoch": 0.2515795851244915,
"grad_norm": 0.9478754997253418,
"learning_rate": 0.00026694895572233556,
"loss": 4.8047,
"step": 1090
},
{
"epoch": 0.25388765471278957,
"grad_norm": 0.9853310585021973,
"learning_rate": 0.000266231421171452,
"loss": 4.744,
"step": 1100
},
{
"epoch": 0.25619572430108767,
"grad_norm": 0.9713465571403503,
"learning_rate": 0.0002655071676834659,
"loss": 4.7124,
"step": 1110
},
{
"epoch": 0.25850379388938577,
"grad_norm": 0.9828191995620728,
"learning_rate": 0.00026477623712497047,
"loss": 4.7588,
"step": 1120
},
{
"epoch": 0.26081186347768387,
"grad_norm": 0.9578918814659119,
"learning_rate": 0.0002640386717485373,
"loss": 4.7603,
"step": 1130
},
{
"epoch": 0.2631199330659819,
"grad_norm": 0.9244778156280518,
"learning_rate": 0.0002632945141902739,
"loss": 4.7823,
"step": 1140
},
{
"epoch": 0.26542800265428,
"grad_norm": 1.0172864198684692,
"learning_rate": 0.00026254380746735926,
"loss": 4.7744,
"step": 1150
},
{
"epoch": 0.2677360722425781,
"grad_norm": 0.9416136741638184,
"learning_rate": 0.00026178659497555663,
"loss": 4.7059,
"step": 1160
},
{
"epoch": 0.2700441418308762,
"grad_norm": 0.8836077451705933,
"learning_rate": 0.0002610229204867055,
"loss": 4.7969,
"step": 1170
},
{
"epoch": 0.27235221141917426,
"grad_norm": 0.9215325713157654,
"learning_rate": 0.000260252828146191,
"loss": 4.7436,
"step": 1180
},
{
"epoch": 0.27466028100747236,
"grad_norm": 0.9629151225090027,
"learning_rate": 0.0002594763624703922,
"loss": 4.7368,
"step": 1190
},
{
"epoch": 0.27696835059577046,
"grad_norm": 1.0467829704284668,
"learning_rate": 0.00025869356834410864,
"loss": 4.6909,
"step": 1200
},
{
"epoch": 0.27927642018406856,
"grad_norm": 0.9704943299293518,
"learning_rate": 0.00025790449101796575,
"loss": 4.6959,
"step": 1210
},
{
"epoch": 0.28158448977236666,
"grad_norm": 0.9856391549110413,
"learning_rate": 0.0002571091761057989,
"loss": 4.6771,
"step": 1220
},
{
"epoch": 0.2838925593606647,
"grad_norm": 1.040661096572876,
"learning_rate": 0.00025630766958201695,
"loss": 4.7199,
"step": 1230
},
{
"epoch": 0.2862006289489628,
"grad_norm": 0.9736753702163696,
"learning_rate": 0.0002555000177789444,
"loss": 4.6799,
"step": 1240
},
{
"epoch": 0.2885086985372609,
"grad_norm": 1.006039023399353,
"learning_rate": 0.00025468626738414305,
"loss": 4.694,
"step": 1250
},
{
"epoch": 0.290816768125559,
"grad_norm": 0.96723473072052,
"learning_rate": 0.0002538664654377134,
"loss": 4.6445,
"step": 1260
},
{
"epoch": 0.29312483771385706,
"grad_norm": 0.9628943800926208,
"learning_rate": 0.00025304065932957494,
"loss": 4.6465,
"step": 1270
},
{
"epoch": 0.29543290730215516,
"grad_norm": 0.9398277997970581,
"learning_rate": 0.00025220889679672745,
"loss": 4.6037,
"step": 1280
},
{
"epoch": 0.29774097689045326,
"grad_norm": 0.9146639108657837,
"learning_rate": 0.00025137122592049066,
"loss": 4.6605,
"step": 1290
},
{
"epoch": 0.30004904647875136,
"grad_norm": 0.9692845940589905,
"learning_rate": 0.0002505276951237254,
"loss": 4.6331,
"step": 1300
},
{
"epoch": 0.30051066039641094,
"eval_loss": 4.868031978607178,
"eval_runtime": 39.4278,
"eval_samples_per_second": 634.071,
"eval_steps_per_second": 79.259,
"step": 1302
},
{
"epoch": 0.3023571160670494,
"grad_norm": 0.9402655959129333,
"learning_rate": 0.00024967835316803434,
"loss": 4.6622,
"step": 1310
},
{
"epoch": 0.3046651856553475,
"grad_norm": 0.9427255392074585,
"learning_rate": 0.00024882324915094305,
"loss": 4.6359,
"step": 1320
},
{
"epoch": 0.3069732552436456,
"grad_norm": 0.9967660307884216,
"learning_rate": 0.00024796243250306196,
"loss": 4.6153,
"step": 1330
},
{
"epoch": 0.3092813248319437,
"grad_norm": 0.9784890413284302,
"learning_rate": 0.00024709595298522916,
"loss": 4.6401,
"step": 1340
},
{
"epoch": 0.31158939442024175,
"grad_norm": 1.011221170425415,
"learning_rate": 0.00024622386068563344,
"loss": 4.5711,
"step": 1350
},
{
"epoch": 0.31389746400853985,
"grad_norm": 1.0162895917892456,
"learning_rate": 0.0002453462060169193,
"loss": 4.6102,
"step": 1360
},
{
"epoch": 0.31620553359683795,
"grad_norm": 0.9559857845306396,
"learning_rate": 0.00024446303971327254,
"loss": 4.6215,
"step": 1370
},
{
"epoch": 0.31851360318513605,
"grad_norm": 0.9640511870384216,
"learning_rate": 0.00024357441282748756,
"loss": 4.6299,
"step": 1380
},
{
"epoch": 0.3208216727734341,
"grad_norm": 0.990332841873169,
"learning_rate": 0.00024268037672801605,
"loss": 4.6633,
"step": 1390
},
{
"epoch": 0.3231297423617322,
"grad_norm": 0.971305787563324,
"learning_rate": 0.00024178098309599782,
"loss": 4.6453,
"step": 1400
},
{
"epoch": 0.3254378119500303,
"grad_norm": 0.9675345420837402,
"learning_rate": 0.00024087628392227304,
"loss": 4.562,
"step": 1410
},
{
"epoch": 0.3277458815383284,
"grad_norm": 0.9208952188491821,
"learning_rate": 0.000239966331504377,
"loss": 4.576,
"step": 1420
},
{
"epoch": 0.33005395112662644,
"grad_norm": 1.004249930381775,
"learning_rate": 0.00023905117844351674,
"loss": 4.6263,
"step": 1430
},
{
"epoch": 0.33236202071492454,
"grad_norm": 0.9654126763343811,
"learning_rate": 0.0002381308776415307,
"loss": 4.6431,
"step": 1440
},
{
"epoch": 0.33467009030322264,
"grad_norm": 0.9294122457504272,
"learning_rate": 0.0002372054822978304,
"loss": 4.5542,
"step": 1450
},
{
"epoch": 0.33697815989152075,
"grad_norm": 0.8988949060440063,
"learning_rate": 0.00023627504590632517,
"loss": 4.6312,
"step": 1460
},
{
"epoch": 0.3392862294798188,
"grad_norm": 1.0338096618652344,
"learning_rate": 0.00023533962225232992,
"loss": 4.5963,
"step": 1470
},
{
"epoch": 0.3415942990681169,
"grad_norm": 0.9440902471542358,
"learning_rate": 0.00023439926540945604,
"loss": 4.5587,
"step": 1480
},
{
"epoch": 0.343902368656415,
"grad_norm": 1.0818684101104736,
"learning_rate": 0.00023345402973648548,
"loss": 4.5462,
"step": 1490
},
{
"epoch": 0.3462104382447131,
"grad_norm": 0.9467376470565796,
"learning_rate": 0.00023250396987422857,
"loss": 4.5969,
"step": 1500
},
{
"epoch": 0.3485185078330112,
"grad_norm": 1.009598970413208,
"learning_rate": 0.00023154914074236522,
"loss": 4.5773,
"step": 1510
},
{
"epoch": 0.35082657742130924,
"grad_norm": 1.053017020225525,
"learning_rate": 0.00023058959753627056,
"loss": 4.6051,
"step": 1520
},
{
"epoch": 0.35313464700960734,
"grad_norm": 0.969485342502594,
"learning_rate": 0.0002296253957238239,
"loss": 4.604,
"step": 1530
},
{
"epoch": 0.35544271659790544,
"grad_norm": 1.2008713483810425,
"learning_rate": 0.00022865659104220255,
"loss": 4.5704,
"step": 1540
},
{
"epoch": 0.35775078618620354,
"grad_norm": 1.0302430391311646,
"learning_rate": 0.00022768323949465987,
"loss": 4.5391,
"step": 1550
},
{
"epoch": 0.3600588557745016,
"grad_norm": 0.966468870639801,
"learning_rate": 0.0002267053973472877,
"loss": 4.5363,
"step": 1560
},
{
"epoch": 0.3623669253627997,
"grad_norm": 0.9762535095214844,
"learning_rate": 0.00022572312112576406,
"loss": 4.5587,
"step": 1570
},
{
"epoch": 0.3646749949510978,
"grad_norm": 1.0306516885757446,
"learning_rate": 0.0002247364676120855,
"loss": 4.5296,
"step": 1580
},
{
"epoch": 0.3669830645393959,
"grad_norm": 0.9838928580284119,
"learning_rate": 0.00022374549384128456,
"loss": 4.5947,
"step": 1590
},
{
"epoch": 0.36929113412769393,
"grad_norm": 0.9782153964042664,
"learning_rate": 0.0002227502570981331,
"loss": 4.5091,
"step": 1600
},
{
"epoch": 0.37159920371599203,
"grad_norm": 1.0444267988204956,
"learning_rate": 0.00022175081491383048,
"loss": 4.5221,
"step": 1610
},
{
"epoch": 0.37390727330429013,
"grad_norm": 1.0300132036209106,
"learning_rate": 0.00022074722506267846,
"loss": 4.5312,
"step": 1620
},
{
"epoch": 0.37621534289258823,
"grad_norm": 1.0193167924880981,
"learning_rate": 0.00021973954555874067,
"loss": 4.5706,
"step": 1630
},
{
"epoch": 0.3785234124808863,
"grad_norm": 1.005703091621399,
"learning_rate": 0.00021872783465248978,
"loss": 4.5149,
"step": 1640
},
{
"epoch": 0.3808314820691844,
"grad_norm": 1.0483267307281494,
"learning_rate": 0.00021771215082743968,
"loss": 4.4801,
"step": 1650
},
{
"epoch": 0.3831395516574825,
"grad_norm": 0.9708661437034607,
"learning_rate": 0.00021669255279676514,
"loss": 4.5121,
"step": 1660
},
{
"epoch": 0.3854476212457806,
"grad_norm": 1.0603435039520264,
"learning_rate": 0.00021566909949990746,
"loss": 4.4826,
"step": 1670
},
{
"epoch": 0.3877556908340786,
"grad_norm": 1.1404428482055664,
"learning_rate": 0.0002146418500991678,
"loss": 4.4787,
"step": 1680
},
{
"epoch": 0.3900637604223767,
"grad_norm": 1.0174285173416138,
"learning_rate": 0.00021361086397628682,
"loss": 4.5004,
"step": 1690
},
{
"epoch": 0.3923718300106748,
"grad_norm": 1.1664884090423584,
"learning_rate": 0.0002125762007290121,
"loss": 4.5197,
"step": 1700
},
{
"epoch": 0.3946798995989729,
"grad_norm": 0.9837216138839722,
"learning_rate": 0.00021153792016765334,
"loss": 4.5019,
"step": 1710
},
{
"epoch": 0.396987969187271,
"grad_norm": 1.0171154737472534,
"learning_rate": 0.00021049608231162454,
"loss": 4.514,
"step": 1720
},
{
"epoch": 0.3992960387755691,
"grad_norm": 1.0302062034606934,
"learning_rate": 0.00020945074738597447,
"loss": 4.5388,
"step": 1730
},
{
"epoch": 0.40068088052854794,
"eval_loss": 4.705195426940918,
"eval_runtime": 39.444,
"eval_samples_per_second": 633.81,
"eval_steps_per_second": 79.226,
"step": 1736
},
{
"epoch": 0.4016041083638672,
"grad_norm": 1.0344492197036743,
"learning_rate": 0.00020840197581790569,
"loss": 4.454,
"step": 1740
},
{
"epoch": 0.4039121779521653,
"grad_norm": 1.0174837112426758,
"learning_rate": 0.00020734982823328104,
"loss": 4.4651,
"step": 1750
},
{
"epoch": 0.4062202475404633,
"grad_norm": 0.9471856355667114,
"learning_rate": 0.00020629436545311928,
"loss": 4.5174,
"step": 1760
},
{
"epoch": 0.4085283171287614,
"grad_norm": 1.0585298538208008,
"learning_rate": 0.00020523564849007906,
"loss": 4.4544,
"step": 1770
},
{
"epoch": 0.4108363867170595,
"grad_norm": 1.0396827459335327,
"learning_rate": 0.00020417373854493228,
"loss": 4.5077,
"step": 1780
},
{
"epoch": 0.4131444563053576,
"grad_norm": 1.0221539735794067,
"learning_rate": 0.0002031086970030259,
"loss": 4.4515,
"step": 1790
},
{
"epoch": 0.41545252589365567,
"grad_norm": 0.9722737669944763,
"learning_rate": 0.00020204058543073393,
"loss": 4.4483,
"step": 1800
},
{
"epoch": 0.41776059548195377,
"grad_norm": 1.0386008024215698,
"learning_rate": 0.00020096946557189802,
"loss": 4.5063,
"step": 1810
},
{
"epoch": 0.42006866507025187,
"grad_norm": 1.1091365814208984,
"learning_rate": 0.00019989539934425857,
"loss": 4.4913,
"step": 1820
},
{
"epoch": 0.42237673465854997,
"grad_norm": 1.083609700202942,
"learning_rate": 0.0001988184488358754,
"loss": 4.4873,
"step": 1830
},
{
"epoch": 0.42468480424684807,
"grad_norm": 1.084915280342102,
"learning_rate": 0.00019773867630153857,
"loss": 4.4625,
"step": 1840
},
{
"epoch": 0.4269928738351461,
"grad_norm": 0.9942842125892639,
"learning_rate": 0.00019665614415916979,
"loss": 4.435,
"step": 1850
},
{
"epoch": 0.4293009434234442,
"grad_norm": 1.0121150016784668,
"learning_rate": 0.00019557091498621416,
"loss": 4.4056,
"step": 1860
},
{
"epoch": 0.4316090130117423,
"grad_norm": 1.0252583026885986,
"learning_rate": 0.00019448305151602272,
"loss": 4.3947,
"step": 1870
},
{
"epoch": 0.4339170826000404,
"grad_norm": 1.09006667137146,
"learning_rate": 0.00019339261663422629,
"loss": 4.4671,
"step": 1880
},
{
"epoch": 0.43622515218833846,
"grad_norm": 1.0912604331970215,
"learning_rate": 0.00019229967337510003,
"loss": 4.3903,
"step": 1890
},
{
"epoch": 0.43853322177663656,
"grad_norm": 1.0326118469238281,
"learning_rate": 0.00019120428491791974,
"loss": 4.4382,
"step": 1900
},
{
"epoch": 0.44084129136493466,
"grad_norm": 1.0649503469467163,
"learning_rate": 0.00019010651458330964,
"loss": 4.3955,
"step": 1910
},
{
"epoch": 0.44314936095323276,
"grad_norm": 1.0922999382019043,
"learning_rate": 0.00018900642582958213,
"loss": 4.4406,
"step": 1920
},
{
"epoch": 0.4454574305415308,
"grad_norm": 1.0487009286880493,
"learning_rate": 0.0001879040822490693,
"loss": 4.4296,
"step": 1930
},
{
"epoch": 0.4477655001298289,
"grad_norm": 0.9958457946777344,
"learning_rate": 0.00018679954756444723,
"loss": 4.46,
"step": 1940
},
{
"epoch": 0.450073569718127,
"grad_norm": 1.0039042234420776,
"learning_rate": 0.00018569288562505183,
"loss": 4.4473,
"step": 1950
},
{
"epoch": 0.4523816393064251,
"grad_norm": 1.1141057014465332,
"learning_rate": 0.00018458416040318857,
"loss": 4.4023,
"step": 1960
},
{
"epoch": 0.45468970889472315,
"grad_norm": 1.052263855934143,
"learning_rate": 0.00018347343599043388,
"loss": 4.4455,
"step": 1970
},
{
"epoch": 0.45699777848302126,
"grad_norm": 0.942537784576416,
"learning_rate": 0.00018236077659393077,
"loss": 4.466,
"step": 1980
},
{
"epoch": 0.45930584807131936,
"grad_norm": 1.0139743089675903,
"learning_rate": 0.00018124624653267682,
"loss": 4.4551,
"step": 1990
},
{
"epoch": 0.46161391765961746,
"grad_norm": 1.0442676544189453,
"learning_rate": 0.0001801299102338063,
"loss": 4.4184,
"step": 2000
},
{
"epoch": 0.4639219872479155,
"grad_norm": 1.1058298349380493,
"learning_rate": 0.00017901183222886592,
"loss": 4.4478,
"step": 2010
},
{
"epoch": 0.4662300568362136,
"grad_norm": 1.1228169202804565,
"learning_rate": 0.00017789207715008428,
"loss": 4.3777,
"step": 2020
},
{
"epoch": 0.4685381264245117,
"grad_norm": 1.022026777267456,
"learning_rate": 0.0001767707097266359,
"loss": 4.4119,
"step": 2030
},
{
"epoch": 0.4708461960128098,
"grad_norm": 1.0868556499481201,
"learning_rate": 0.0001756477947808994,
"loss": 4.3989,
"step": 2040
},
{
"epoch": 0.47315426560110785,
"grad_norm": 1.0714977979660034,
"learning_rate": 0.00017452339722471026,
"loss": 4.4166,
"step": 2050
},
{
"epoch": 0.47546233518940595,
"grad_norm": 1.0716434717178345,
"learning_rate": 0.0001733975820556086,
"loss": 4.3757,
"step": 2060
},
{
"epoch": 0.47777040477770405,
"grad_norm": 1.0657131671905518,
"learning_rate": 0.00017227041435308177,
"loss": 4.3756,
"step": 2070
},
{
"epoch": 0.48007847436600215,
"grad_norm": 1.0671368837356567,
"learning_rate": 0.00017114195927480256,
"loss": 4.3956,
"step": 2080
},
{
"epoch": 0.4823865439543002,
"grad_norm": 1.0677906274795532,
"learning_rate": 0.00017001228205286236,
"loss": 4.3989,
"step": 2090
},
{
"epoch": 0.4846946135425983,
"grad_norm": 1.0088515281677246,
"learning_rate": 0.00016888144799000047,
"loss": 4.4024,
"step": 2100
},
{
"epoch": 0.4870026831308964,
"grad_norm": 0.9815865755081177,
"learning_rate": 0.0001677495224558293,
"loss": 4.4096,
"step": 2110
},
{
"epoch": 0.4893107527191945,
"grad_norm": 1.1238495111465454,
"learning_rate": 0.00016661657088305526,
"loss": 4.3879,
"step": 2120
},
{
"epoch": 0.4916188223074926,
"grad_norm": 1.0731137990951538,
"learning_rate": 0.0001654826587636967,
"loss": 4.4087,
"step": 2130
},
{
"epoch": 0.49392689189579064,
"grad_norm": 1.1006789207458496,
"learning_rate": 0.0001643478516452977,
"loss": 4.3862,
"step": 2140
},
{
"epoch": 0.49623496148408874,
"grad_norm": 1.0866320133209229,
"learning_rate": 0.00016321221512713928,
"loss": 4.3835,
"step": 2150
},
{
"epoch": 0.49854303107238684,
"grad_norm": 1.1201952695846558,
"learning_rate": 0.00016207581485644707,
"loss": 4.3263,
"step": 2160
},
{
"epoch": 0.500851100660685,
"grad_norm": 1.0500922203063965,
"learning_rate": 0.0001609387165245966,
"loss": 4.3604,
"step": 2170
},
{
"epoch": 0.500851100660685,
"eval_loss": 4.576458930969238,
"eval_runtime": 39.2462,
"eval_samples_per_second": 637.005,
"eval_steps_per_second": 79.626,
"step": 2170
},
{
"epoch": 0.503159170248983,
"grad_norm": 1.1081622838974,
"learning_rate": 0.0001598009858633161,
"loss": 4.3842,
"step": 2180
},
{
"epoch": 0.5054672398372811,
"grad_norm": 1.0299861431121826,
"learning_rate": 0.00015866268864088626,
"loss": 4.435,
"step": 2190
},
{
"epoch": 0.5077753094255791,
"grad_norm": 1.0571188926696777,
"learning_rate": 0.00015752389065833898,
"loss": 4.3528,
"step": 2200
},
{
"epoch": 0.5100833790138772,
"grad_norm": 1.1348025798797607,
"learning_rate": 0.0001563846577456533,
"loss": 4.3108,
"step": 2210
},
{
"epoch": 0.5123914486021753,
"grad_norm": 1.0860552787780762,
"learning_rate": 0.00015524505575794997,
"loss": 4.3618,
"step": 2220
},
{
"epoch": 0.5146995181904734,
"grad_norm": 1.085950493812561,
"learning_rate": 0.0001541051505716849,
"loss": 4.3482,
"step": 2230
},
{
"epoch": 0.5170075877787715,
"grad_norm": 1.0809770822525024,
"learning_rate": 0.00015296500808084055,
"loss": 4.3486,
"step": 2240
},
{
"epoch": 0.5193156573670696,
"grad_norm": 1.1132404804229736,
"learning_rate": 0.00015182469419311754,
"loss": 4.311,
"step": 2250
},
{
"epoch": 0.5216237269553677,
"grad_norm": 1.073387861251831,
"learning_rate": 0.00015068427482612393,
"loss": 4.3567,
"step": 2260
},
{
"epoch": 0.5239317965436658,
"grad_norm": 1.0589022636413574,
"learning_rate": 0.0001495438159035655,
"loss": 4.3789,
"step": 2270
},
{
"epoch": 0.5262398661319638,
"grad_norm": 1.082677960395813,
"learning_rate": 0.00014840338335143452,
"loss": 4.3612,
"step": 2280
},
{
"epoch": 0.5285479357202619,
"grad_norm": 1.0973902940750122,
"learning_rate": 0.0001472630430941987,
"loss": 4.3735,
"step": 2290
},
{
"epoch": 0.53085600530856,
"grad_norm": 1.1433314085006714,
"learning_rate": 0.00014612286105099068,
"loss": 4.3271,
"step": 2300
},
{
"epoch": 0.5331640748968581,
"grad_norm": 1.0663039684295654,
"learning_rate": 0.00014498290313179725,
"loss": 4.3353,
"step": 2310
},
{
"epoch": 0.5354721444851562,
"grad_norm": 1.1327035427093506,
"learning_rate": 0.00014384323523364948,
"loss": 4.3811,
"step": 2320
},
{
"epoch": 0.5377802140734543,
"grad_norm": 1.102273941040039,
"learning_rate": 0.00014270392323681303,
"loss": 4.3814,
"step": 2330
},
{
"epoch": 0.5400882836617524,
"grad_norm": 1.1237622499465942,
"learning_rate": 0.00014156503300098038,
"loss": 4.34,
"step": 2340
},
{
"epoch": 0.5423963532500505,
"grad_norm": 1.1374157667160034,
"learning_rate": 0.00014042663036146344,
"loss": 4.3239,
"step": 2350
},
{
"epoch": 0.5447044228383485,
"grad_norm": 1.171049952507019,
"learning_rate": 0.0001392887811253878,
"loss": 4.3335,
"step": 2360
},
{
"epoch": 0.5470124924266466,
"grad_norm": 1.128770112991333,
"learning_rate": 0.00013815155106788865,
"loss": 4.3021,
"step": 2370
},
{
"epoch": 0.5493205620149447,
"grad_norm": 1.1492712497711182,
"learning_rate": 0.00013701500592830878,
"loss": 4.3139,
"step": 2380
},
{
"epoch": 0.5516286316032428,
"grad_norm": 1.1176902055740356,
"learning_rate": 0.00013587921140639805,
"loss": 4.3339,
"step": 2390
},
{
"epoch": 0.5539367011915409,
"grad_norm": 1.106078863143921,
"learning_rate": 0.00013474423315851586,
"loss": 4.314,
"step": 2400
},
{
"epoch": 0.556244770779839,
"grad_norm": 1.0579196214675903,
"learning_rate": 0.00013361013679383553,
"loss": 4.2973,
"step": 2410
},
{
"epoch": 0.5585528403681371,
"grad_norm": 1.0957796573638916,
"learning_rate": 0.0001324769878705518,
"loss": 4.2925,
"step": 2420
},
{
"epoch": 0.5608609099564352,
"grad_norm": 1.1303716897964478,
"learning_rate": 0.000131344851892091,
"loss": 4.338,
"step": 2430
},
{
"epoch": 0.5631689795447333,
"grad_norm": 1.1496975421905518,
"learning_rate": 0.0001302137943033249,
"loss": 4.3075,
"step": 2440
},
{
"epoch": 0.5654770491330313,
"grad_norm": 1.1256930828094482,
"learning_rate": 0.00012908388048678686,
"loss": 4.3234,
"step": 2450
},
{
"epoch": 0.5677851187213294,
"grad_norm": 1.1006488800048828,
"learning_rate": 0.00012795517575889303,
"loss": 4.311,
"step": 2460
},
{
"epoch": 0.5700931883096275,
"grad_norm": 1.223235845565796,
"learning_rate": 0.00012682774536616623,
"loss": 4.3056,
"step": 2470
},
{
"epoch": 0.5724012578979256,
"grad_norm": 1.2225327491760254,
"learning_rate": 0.00012570165448146447,
"loss": 4.3276,
"step": 2480
},
{
"epoch": 0.5747093274862237,
"grad_norm": 1.152992844581604,
"learning_rate": 0.00012457696820021314,
"loss": 4.3058,
"step": 2490
},
{
"epoch": 0.5770173970745218,
"grad_norm": 1.252081036567688,
"learning_rate": 0.00012345375153664264,
"loss": 4.2789,
"step": 2500
},
{
"epoch": 0.5793254666628199,
"grad_norm": 1.1459189653396606,
"learning_rate": 0.0001223320694200297,
"loss": 4.3181,
"step": 2510
},
{
"epoch": 0.581633536251118,
"grad_norm": 1.0899156332015991,
"learning_rate": 0.00012121198669094436,
"loss": 4.3692,
"step": 2520
},
{
"epoch": 0.583941605839416,
"grad_norm": 1.060966968536377,
"learning_rate": 0.00012009356809750131,
"loss": 4.3294,
"step": 2530
},
{
"epoch": 0.5862496754277141,
"grad_norm": 1.1345113515853882,
"learning_rate": 0.0001189768782916175,
"loss": 4.3261,
"step": 2540
},
{
"epoch": 0.5885577450160122,
"grad_norm": 1.1231807470321655,
"learning_rate": 0.00011786198182527461,
"loss": 4.3368,
"step": 2550
},
{
"epoch": 0.5908658146043103,
"grad_norm": 1.0552847385406494,
"learning_rate": 0.00011674894314678761,
"loss": 4.2938,
"step": 2560
},
{
"epoch": 0.5931738841926084,
"grad_norm": 1.1445595026016235,
"learning_rate": 0.00011563782659707897,
"loss": 4.3184,
"step": 2570
},
{
"epoch": 0.5954819537809065,
"grad_norm": 1.1536098718643188,
"learning_rate": 0.00011452869640595975,
"loss": 4.3189,
"step": 2580
},
{
"epoch": 0.5977900233692046,
"grad_norm": 1.2136541604995728,
"learning_rate": 0.00011342161668841641,
"loss": 4.2195,
"step": 2590
},
{
"epoch": 0.6000980929575027,
"grad_norm": 1.1163119077682495,
"learning_rate": 0.00011231665144090456,
"loss": 4.2419,
"step": 2600
},
{
"epoch": 0.6010213207928219,
"eval_loss": 4.489214897155762,
"eval_runtime": 39.3697,
"eval_samples_per_second": 635.006,
"eval_steps_per_second": 79.376,
"step": 2604
},
{
"epoch": 0.6024061625458007,
"grad_norm": 1.4087986946105957,
"learning_rate": 0.0001112138645376496,
"loss": 4.2601,
"step": 2610
},
{
"epoch": 0.6047142321340988,
"grad_norm": 1.19622802734375,
"learning_rate": 0.00011011331972695449,
"loss": 4.296,
"step": 2620
},
{
"epoch": 0.6070223017223969,
"grad_norm": 1.1068109273910522,
"learning_rate": 0.00010901508062751438,
"loss": 4.2879,
"step": 2630
},
{
"epoch": 0.609330371310695,
"grad_norm": 1.156851887702942,
"learning_rate": 0.00010791921072473941,
"loss": 4.2653,
"step": 2640
},
{
"epoch": 0.6116384408989931,
"grad_norm": 1.1451683044433594,
"learning_rate": 0.00010682577336708449,
"loss": 4.2987,
"step": 2650
},
{
"epoch": 0.6139465104872912,
"grad_norm": 1.133888840675354,
"learning_rate": 0.00010573483176238752,
"loss": 4.2558,
"step": 2660
},
{
"epoch": 0.6162545800755893,
"grad_norm": 1.1750303506851196,
"learning_rate": 0.00010464644897421561,
"loss": 4.3379,
"step": 2670
},
{
"epoch": 0.6185626496638874,
"grad_norm": 1.1152632236480713,
"learning_rate": 0.00010356068791821953,
"loss": 4.2346,
"step": 2680
},
{
"epoch": 0.6208707192521854,
"grad_norm": 1.167487382888794,
"learning_rate": 0.0001024776113584966,
"loss": 4.2805,
"step": 2690
},
{
"epoch": 0.6231787888404835,
"grad_norm": 1.1430394649505615,
"learning_rate": 0.00010139728190396288,
"loss": 4.2433,
"step": 2700
},
{
"epoch": 0.6254868584287816,
"grad_norm": 1.181851863861084,
"learning_rate": 0.00010031976200473364,
"loss": 4.2759,
"step": 2710
},
{
"epoch": 0.6277949280170797,
"grad_norm": 1.152464509010315,
"learning_rate": 9.92451139485136e-05,
"loss": 4.2761,
"step": 2720
},
{
"epoch": 0.6301029976053778,
"grad_norm": 1.1081931591033936,
"learning_rate": 9.817339985699593e-05,
"loss": 4.2457,
"step": 2730
},
{
"epoch": 0.6324110671936759,
"grad_norm": 1.1744070053100586,
"learning_rate": 9.710468168227158e-05,
"loss": 4.2863,
"step": 2740
},
{
"epoch": 0.634719136781974,
"grad_norm": 1.218344807624817,
"learning_rate": 9.60390212032479e-05,
"loss": 4.2711,
"step": 2750
},
{
"epoch": 0.6370272063702721,
"grad_norm": 1.1617342233657837,
"learning_rate": 9.497648002207745e-05,
"loss": 4.2289,
"step": 2760
},
{
"epoch": 0.6393352759585702,
"grad_norm": 1.1277827024459839,
"learning_rate": 9.391711956059675e-05,
"loss": 4.2894,
"step": 2770
},
{
"epoch": 0.6416433455468682,
"grad_norm": 1.2645858526229858,
"learning_rate": 9.286100105677608e-05,
"loss": 4.2934,
"step": 2780
},
{
"epoch": 0.6439514151351663,
"grad_norm": 1.1396396160125732,
"learning_rate": 9.180818556117931e-05,
"loss": 4.2627,
"step": 2790
},
{
"epoch": 0.6462594847234644,
"grad_norm": 1.1112399101257324,
"learning_rate": 9.075873393343487e-05,
"loss": 4.2799,
"step": 2800
},
{
"epoch": 0.6485675543117625,
"grad_norm": 1.1682491302490234,
"learning_rate": 8.971270683871736e-05,
"loss": 4.2557,
"step": 2810
},
{
"epoch": 0.6508756239000606,
"grad_norm": 1.1381292343139648,
"learning_rate": 8.867016474424121e-05,
"loss": 4.2468,
"step": 2820
},
{
"epoch": 0.6531836934883587,
"grad_norm": 1.2451766729354858,
"learning_rate": 8.763116791576497e-05,
"loss": 4.3402,
"step": 2830
},
{
"epoch": 0.6554917630766568,
"grad_norm": 1.2092978954315186,
"learning_rate": 8.659577641410756e-05,
"loss": 4.2999,
"step": 2840
},
{
"epoch": 0.6577998326649549,
"grad_norm": 1.1491693258285522,
"learning_rate": 8.556405009167627e-05,
"loss": 4.2427,
"step": 2850
},
{
"epoch": 0.6601079022532529,
"grad_norm": 1.166319489479065,
"learning_rate": 8.453604858900736e-05,
"loss": 4.2599,
"step": 2860
},
{
"epoch": 0.662415971841551,
"grad_norm": 1.1869049072265625,
"learning_rate": 8.351183133131778e-05,
"loss": 4.2849,
"step": 2870
},
{
"epoch": 0.6647240414298491,
"grad_norm": 1.1337544918060303,
"learning_rate": 8.24914575250707e-05,
"loss": 4.2321,
"step": 2880
},
{
"epoch": 0.6670321110181472,
"grad_norm": 1.1453369855880737,
"learning_rate": 8.147498615455221e-05,
"loss": 4.2508,
"step": 2890
},
{
"epoch": 0.6693401806064453,
"grad_norm": 1.2037177085876465,
"learning_rate": 8.046247597846244e-05,
"loss": 4.2616,
"step": 2900
},
{
"epoch": 0.6716482501947434,
"grad_norm": 1.2716485261917114,
"learning_rate": 7.945398552651837e-05,
"loss": 4.2711,
"step": 2910
},
{
"epoch": 0.6739563197830415,
"grad_norm": 1.1850802898406982,
"learning_rate": 7.844957309607061e-05,
"loss": 4.254,
"step": 2920
},
{
"epoch": 0.6762643893713396,
"grad_norm": 1.1652292013168335,
"learning_rate": 7.744929674873344e-05,
"loss": 4.2528,
"step": 2930
},
{
"epoch": 0.6785724589596376,
"grad_norm": 1.1236425638198853,
"learning_rate": 7.645321430702854e-05,
"loss": 4.2309,
"step": 2940
},
{
"epoch": 0.6808805285479357,
"grad_norm": 1.1567282676696777,
"learning_rate": 7.546138335104229e-05,
"loss": 4.2226,
"step": 2950
},
{
"epoch": 0.6831885981362338,
"grad_norm": 1.2129831314086914,
"learning_rate": 7.447386121509741e-05,
"loss": 4.2682,
"step": 2960
},
{
"epoch": 0.6854966677245319,
"grad_norm": 1.1564743518829346,
"learning_rate": 7.349070498443857e-05,
"loss": 4.2495,
"step": 2970
},
{
"epoch": 0.68780473731283,
"grad_norm": 1.230202317237854,
"learning_rate": 7.251197149193251e-05,
"loss": 4.2339,
"step": 2980
},
{
"epoch": 0.6901128069011281,
"grad_norm": 1.1715277433395386,
"learning_rate": 7.153771731478289e-05,
"loss": 4.2226,
"step": 2990
},
{
"epoch": 0.6924208764894262,
"grad_norm": 1.2347677946090698,
"learning_rate": 7.05679987712595e-05,
"loss": 4.227,
"step": 3000
},
{
"epoch": 0.6947289460777243,
"grad_norm": 1.19216787815094,
"learning_rate": 6.96028719174428e-05,
"loss": 4.2868,
"step": 3010
},
{
"epoch": 0.6970370156660224,
"grad_norm": 1.2830464839935303,
"learning_rate": 6.864239254398352e-05,
"loss": 4.2326,
"step": 3020
},
{
"epoch": 0.6993450852543204,
"grad_norm": 1.2899502515792847,
"learning_rate": 6.76866161728778e-05,
"loss": 4.2616,
"step": 3030
},
{
"epoch": 0.7011915409249588,
"eval_loss": 4.435102462768555,
"eval_runtime": 39.3746,
"eval_samples_per_second": 634.927,
"eval_steps_per_second": 79.366,
"step": 3038
},
{
"epoch": 0.7016531548426185,
"grad_norm": 1.1735284328460693,
"learning_rate": 6.67355980542571e-05,
"loss": 4.2741,
"step": 3040
},
{
"epoch": 0.7039612244309166,
"grad_norm": 1.2111891508102417,
"learning_rate": 6.578939316319502e-05,
"loss": 4.2271,
"step": 3050
},
{
"epoch": 0.7062692940192147,
"grad_norm": 1.188081979751587,
"learning_rate": 6.484805619652893e-05,
"loss": 4.2188,
"step": 3060
},
{
"epoch": 0.7085773636075128,
"grad_norm": 1.285130500793457,
"learning_rate": 6.391164156969856e-05,
"loss": 4.193,
"step": 3070
},
{
"epoch": 0.7108854331958109,
"grad_norm": 1.1954678297042847,
"learning_rate": 6.298020341359972e-05,
"loss": 4.211,
"step": 3080
},
{
"epoch": 0.713193502784109,
"grad_norm": 1.1639289855957031,
"learning_rate": 6.205379557145607e-05,
"loss": 4.2421,
"step": 3090
},
{
"epoch": 0.7155015723724071,
"grad_norm": 1.1945711374282837,
"learning_rate": 6.113247159570591e-05,
"loss": 4.2843,
"step": 3100
},
{
"epoch": 0.7178096419607051,
"grad_norm": 1.2462584972381592,
"learning_rate": 6.0216284744907036e-05,
"loss": 4.2239,
"step": 3110
},
{
"epoch": 0.7201177115490032,
"grad_norm": 1.2085964679718018,
"learning_rate": 5.930528798065741e-05,
"loss": 4.2027,
"step": 3120
},
{
"epoch": 0.7224257811373013,
"grad_norm": 1.1740282773971558,
"learning_rate": 5.839953396453442e-05,
"loss": 4.2056,
"step": 3130
},
{
"epoch": 0.7247338507255994,
"grad_norm": 1.1846504211425781,
"learning_rate": 5.749907505504999e-05,
"loss": 4.2292,
"step": 3140
},
{
"epoch": 0.7270419203138975,
"grad_norm": 1.3088330030441284,
"learning_rate": 5.660396330462448e-05,
"loss": 4.2503,
"step": 3150
},
{
"epoch": 0.7293499899021956,
"grad_norm": 1.2662092447280884,
"learning_rate": 5.571425045657711e-05,
"loss": 4.2536,
"step": 3160
},
{
"epoch": 0.7316580594904937,
"grad_norm": 1.1370505094528198,
"learning_rate": 5.4829987942135495e-05,
"loss": 4.1551,
"step": 3170
},
{
"epoch": 0.7339661290787918,
"grad_norm": 1.244957447052002,
"learning_rate": 5.395122687746217e-05,
"loss": 4.2295,
"step": 3180
},
{
"epoch": 0.7362741986670898,
"grad_norm": 1.2070027589797974,
"learning_rate": 5.3078018060699836e-05,
"loss": 4.2056,
"step": 3190
},
{
"epoch": 0.7385822682553879,
"grad_norm": 1.2402708530426025,
"learning_rate": 5.221041196903489e-05,
"loss": 4.243,
"step": 3200
},
{
"epoch": 0.740890337843686,
"grad_norm": 1.2046093940734863,
"learning_rate": 5.1348458755779706e-05,
"loss": 4.2083,
"step": 3210
},
{
"epoch": 0.7431984074319841,
"grad_norm": 1.2073742151260376,
"learning_rate": 5.049220824747306e-05,
"loss": 4.2024,
"step": 3220
},
{
"epoch": 0.7455064770202822,
"grad_norm": 1.1986374855041504,
"learning_rate": 4.964170994100019e-05,
"loss": 4.2975,
"step": 3230
},
{
"epoch": 0.7478145466085803,
"grad_norm": 1.2024635076522827,
"learning_rate": 4.879701300073134e-05,
"loss": 4.2499,
"step": 3240
},
{
"epoch": 0.7501226161968784,
"grad_norm": 1.20681893825531,
"learning_rate": 4.7958166255679787e-05,
"loss": 4.2109,
"step": 3250
},
{
"epoch": 0.7524306857851765,
"grad_norm": 1.2092540264129639,
"learning_rate": 4.712521819667936e-05,
"loss": 4.2221,
"step": 3260
},
{
"epoch": 0.7547387553734745,
"grad_norm": 1.1802047491073608,
"learning_rate": 4.629821697358108e-05,
"loss": 4.202,
"step": 3270
},
{
"epoch": 0.7570468249617726,
"grad_norm": 1.237534999847412,
"learning_rate": 4.5477210392469944e-05,
"loss": 4.2039,
"step": 3280
},
{
"epoch": 0.7593548945500707,
"grad_norm": 1.212430715560913,
"learning_rate": 4.4662245912901364e-05,
"loss": 4.2043,
"step": 3290
},
{
"epoch": 0.7616629641383688,
"grad_norm": 1.2303744554519653,
"learning_rate": 4.38533706451579e-05,
"loss": 4.2249,
"step": 3300
},
{
"epoch": 0.7639710337266669,
"grad_norm": 1.2151821851730347,
"learning_rate": 4.305063134752559e-05,
"loss": 4.2416,
"step": 3310
},
{
"epoch": 0.766279103314965,
"grad_norm": 1.1624999046325684,
"learning_rate": 4.225407442359134e-05,
"loss": 4.248,
"step": 3320
},
{
"epoch": 0.7685871729032631,
"grad_norm": 1.1886614561080933,
"learning_rate": 4.1463745919560296e-05,
"loss": 4.1549,
"step": 3330
},
{
"epoch": 0.7708952424915612,
"grad_norm": 1.141176700592041,
"learning_rate": 4.067969152159433e-05,
"loss": 4.1967,
"step": 3340
},
{
"epoch": 0.7732033120798593,
"grad_norm": 1.1527976989746094,
"learning_rate": 3.9901956553170714e-05,
"loss": 4.193,
"step": 3350
},
{
"epoch": 0.7755113816681573,
"grad_norm": 1.2025054693222046,
"learning_rate": 3.913058597246242e-05,
"loss": 4.1946,
"step": 3360
},
{
"epoch": 0.7778194512564554,
"grad_norm": 1.13848078250885,
"learning_rate": 3.836562436973906e-05,
"loss": 4.1719,
"step": 3370
},
{
"epoch": 0.7801275208447535,
"grad_norm": 1.2138718366622925,
"learning_rate": 3.7607115964789537e-05,
"loss": 4.2069,
"step": 3380
},
{
"epoch": 0.7824355904330516,
"grad_norm": 1.1834176778793335,
"learning_rate": 3.6855104604365485e-05,
"loss": 4.2246,
"step": 3390
},
{
"epoch": 0.7847436600213497,
"grad_norm": 1.157386064529419,
"learning_rate": 3.610963375964694e-05,
"loss": 4.2147,
"step": 3400
},
{
"epoch": 0.7870517296096478,
"grad_norm": 1.251615285873413,
"learning_rate": 3.5370746523729215e-05,
"loss": 4.2354,
"step": 3410
},
{
"epoch": 0.7893597991979459,
"grad_norm": 1.279531478881836,
"learning_rate": 3.463848560913199e-05,
"loss": 4.2083,
"step": 3420
},
{
"epoch": 0.791667868786244,
"grad_norm": 1.2485535144805908,
"learning_rate": 3.391289334533026e-05,
"loss": 4.1657,
"step": 3430
},
{
"epoch": 0.793975938374542,
"grad_norm": 1.2322598695755005,
"learning_rate": 3.3194011676307234e-05,
"loss": 4.1474,
"step": 3440
},
{
"epoch": 0.79628400796284,
"grad_norm": 1.2795343399047852,
"learning_rate": 3.248188215812985e-05,
"loss": 4.1557,
"step": 3450
},
{
"epoch": 0.7985920775511381,
"grad_norm": 1.236671805381775,
"learning_rate": 3.1776545956546473e-05,
"loss": 4.1628,
"step": 3460
},
{
"epoch": 0.8009001471394362,
"grad_norm": 1.163258671760559,
"learning_rate": 3.107804384460745e-05,
"loss": 4.2085,
"step": 3470
},
{
"epoch": 0.8013617610570959,
"eval_loss": 4.39634370803833,
"eval_runtime": 39.8062,
"eval_samples_per_second": 628.043,
"eval_steps_per_second": 78.505,
"step": 3472
},
{
"epoch": 0.8032082167277343,
"grad_norm": 1.2136709690093994,
"learning_rate": 3.0386416200307772e-05,
"loss": 4.2476,
"step": 3480
},
{
"epoch": 0.8055162863160324,
"grad_norm": 1.214560866355896,
"learning_rate": 2.970170300425341e-05,
"loss": 4.1994,
"step": 3490
},
{
"epoch": 0.8078243559043305,
"grad_norm": 1.2116317749023438,
"learning_rate": 2.9023943837349795e-05,
"loss": 4.1864,
"step": 3500
},
{
"epoch": 0.8101324254926286,
"grad_norm": 1.2190097570419312,
"learning_rate": 2.835317787851411e-05,
"loss": 4.2019,
"step": 3510
},
{
"epoch": 0.8124404950809266,
"grad_norm": 1.2359529733657837,
"learning_rate": 2.768944390241012e-05,
"loss": 4.1716,
"step": 3520
},
{
"epoch": 0.8147485646692247,
"grad_norm": 1.255321979522705,
"learning_rate": 2.703278027720713e-05,
"loss": 4.1866,
"step": 3530
},
{
"epoch": 0.8170566342575228,
"grad_norm": 1.2041914463043213,
"learning_rate": 2.6383224962361766e-05,
"loss": 4.2161,
"step": 3540
},
{
"epoch": 0.8193647038458209,
"grad_norm": 1.2864853143692017,
"learning_rate": 2.5740815506423917e-05,
"loss": 4.1654,
"step": 3550
},
{
"epoch": 0.821672773434119,
"grad_norm": 1.3623309135437012,
"learning_rate": 2.51055890448658e-05,
"loss": 4.2003,
"step": 3560
},
{
"epoch": 0.8239808430224171,
"grad_norm": 1.291591763496399,
"learning_rate": 2.44775822979358e-05,
"loss": 4.159,
"step": 3570
},
{
"epoch": 0.8262889126107152,
"grad_norm": 1.2855194807052612,
"learning_rate": 2.3856831568535307e-05,
"loss": 4.1886,
"step": 3580
},
{
"epoch": 0.8285969821990133,
"grad_norm": 1.2652373313903809,
"learning_rate": 2.324337274012061e-05,
"loss": 4.1722,
"step": 3590
},
{
"epoch": 0.8309050517873113,
"grad_norm": 1.2906244993209839,
"learning_rate": 2.2637241274628108e-05,
"loss": 4.1888,
"step": 3600
},
{
"epoch": 0.8332131213756094,
"grad_norm": 1.2488876581192017,
"learning_rate": 2.2038472210424952e-05,
"loss": 4.2159,
"step": 3610
},
{
"epoch": 0.8355211909639075,
"grad_norm": 1.2642358541488647,
"learning_rate": 2.1447100160283082e-05,
"loss": 4.1982,
"step": 3620
},
{
"epoch": 0.8378292605522056,
"grad_norm": 1.2974900007247925,
"learning_rate": 2.0863159309378657e-05,
"loss": 4.2046,
"step": 3630
},
{
"epoch": 0.8401373301405037,
"grad_norm": 1.2382539510726929,
"learning_rate": 2.0286683413315873e-05,
"loss": 4.1495,
"step": 3640
},
{
"epoch": 0.8424453997288018,
"grad_norm": 1.25460946559906,
"learning_rate": 1.9717705796175727e-05,
"loss": 4.2023,
"step": 3650
},
{
"epoch": 0.8447534693170999,
"grad_norm": 1.22829270362854,
"learning_rate": 1.9156259348589514e-05,
"loss": 4.1346,
"step": 3660
},
{
"epoch": 0.847061538905398,
"grad_norm": 1.2403064966201782,
"learning_rate": 1.8602376525837655e-05,
"loss": 4.1988,
"step": 3670
},
{
"epoch": 0.8493696084936961,
"grad_norm": 1.2172107696533203,
"learning_rate": 1.8056089345973536e-05,
"loss": 4.2222,
"step": 3680
},
{
"epoch": 0.8516776780819941,
"grad_norm": 1.2095916271209717,
"learning_rate": 1.7517429387972608e-05,
"loss": 4.1647,
"step": 3690
},
{
"epoch": 0.8539857476702922,
"grad_norm": 1.2624644041061401,
"learning_rate": 1.6986427789907115e-05,
"loss": 4.2337,
"step": 3700
},
{
"epoch": 0.8562938172585903,
"grad_norm": 1.2449021339416504,
"learning_rate": 1.6463115247145782e-05,
"loss": 4.1926,
"step": 3710
},
{
"epoch": 0.8586018868468884,
"grad_norm": 1.2478352785110474,
"learning_rate": 1.594752201057968e-05,
"loss": 4.1702,
"step": 3720
},
{
"epoch": 0.8609099564351865,
"grad_norm": 1.263993740081787,
"learning_rate": 1.5439677884873424e-05,
"loss": 4.1817,
"step": 3730
},
{
"epoch": 0.8632180260234846,
"grad_norm": 1.1771023273468018,
"learning_rate": 1.4939612226742347e-05,
"loss": 4.1884,
"step": 3740
},
{
"epoch": 0.8655260956117827,
"grad_norm": 1.2805135250091553,
"learning_rate": 1.4447353943255341e-05,
"loss": 4.1973,
"step": 3750
},
{
"epoch": 0.8678341652000808,
"grad_norm": 1.2390002012252808,
"learning_rate": 1.3962931490163992e-05,
"loss": 4.163,
"step": 3760
},
{
"epoch": 0.8701422347883788,
"grad_norm": 1.1928465366363525,
"learning_rate": 1.3486372870257539e-05,
"loss": 4.2661,
"step": 3770
},
{
"epoch": 0.8724503043766769,
"grad_norm": 1.227087378501892,
"learning_rate": 1.3017705631744263e-05,
"loss": 4.1941,
"step": 3780
},
{
"epoch": 0.874758373964975,
"grad_norm": 1.2422763109207153,
"learning_rate": 1.255695686665883e-05,
"loss": 4.1729,
"step": 3790
},
{
"epoch": 0.8770664435532731,
"grad_norm": 1.2221300601959229,
"learning_rate": 1.2104153209296374e-05,
"loss": 4.1766,
"step": 3800
},
{
"epoch": 0.8793745131415712,
"grad_norm": 1.2882457971572876,
"learning_rate": 1.1659320834672753e-05,
"loss": 4.2104,
"step": 3810
},
{
"epoch": 0.8816825827298693,
"grad_norm": 1.3336913585662842,
"learning_rate": 1.1222485457011516e-05,
"loss": 4.2294,
"step": 3820
},
{
"epoch": 0.8839906523181674,
"grad_norm": 1.1365299224853516,
"learning_rate": 1.079367232825743e-05,
"loss": 4.1763,
"step": 3830
},
{
"epoch": 0.8862987219064655,
"grad_norm": 1.3608059883117676,
"learning_rate": 1.0372906236616734e-05,
"loss": 4.2236,
"step": 3840
},
{
"epoch": 0.8886067914947635,
"grad_norm": 1.3224635124206543,
"learning_rate": 9.960211505124215e-06,
"loss": 4.1455,
"step": 3850
},
{
"epoch": 0.8909148610830616,
"grad_norm": 1.2558730840682983,
"learning_rate": 9.555611990237317e-06,
"loss": 4.1669,
"step": 3860
},
{
"epoch": 0.8932229306713597,
"grad_norm": 1.2785941362380981,
"learning_rate": 9.159131080456839e-06,
"loss": 4.1952,
"step": 3870
},
{
"epoch": 0.8955310002596578,
"grad_norm": 1.2829550504684448,
"learning_rate": 8.770791694975093e-06,
"loss": 4.1654,
"step": 3880
},
{
"epoch": 0.8978390698479559,
"grad_norm": 1.3000894784927368,
"learning_rate": 8.390616282350992e-06,
"loss": 4.223,
"step": 3890
},
{
"epoch": 0.900147139436254,
"grad_norm": 1.4616061449050903,
"learning_rate": 8.018626819212354e-06,
"loss": 4.1981,
"step": 3900
},
{
"epoch": 0.9015319811892328,
"eval_loss": 4.381021022796631,
"eval_runtime": 39.549,
"eval_samples_per_second": 632.127,
"eval_steps_per_second": 79.016,
"step": 3906
},
{
"epoch": 0.9024552090245521,
"grad_norm": 1.295082449913025,
"learning_rate": 7.654844808985617e-06,
"loss": 4.1607,
"step": 3910
},
{
"epoch": 0.9047632786128502,
"grad_norm": 1.3329360485076904,
"learning_rate": 7.299291280652503e-06,
"loss": 4.1855,
"step": 3920
},
{
"epoch": 0.9070713482011483,
"grad_norm": 1.2815297842025757,
"learning_rate": 6.951986787534824e-06,
"loss": 4.2036,
"step": 3930
},
{
"epoch": 0.9093794177894463,
"grad_norm": 1.2049307823181152,
"learning_rate": 6.612951406106015e-06,
"loss": 4.1678,
"step": 3940
},
{
"epoch": 0.9116874873777444,
"grad_norm": 1.3223621845245361,
"learning_rate": 6.282204734830826e-06,
"loss": 4.1758,
"step": 3950
},
{
"epoch": 0.9139955569660425,
"grad_norm": 1.3526756763458252,
"learning_rate": 5.959765893032131e-06,
"loss": 4.1829,
"step": 3960
},
{
"epoch": 0.9163036265543406,
"grad_norm": 1.3002768754959106,
"learning_rate": 5.645653519786064e-06,
"loss": 4.1908,
"step": 3970
},
{
"epoch": 0.9186116961426387,
"grad_norm": 1.294180989265442,
"learning_rate": 5.339885772844227e-06,
"loss": 4.1767,
"step": 3980
},
{
"epoch": 0.9209197657309368,
"grad_norm": 1.2366904020309448,
"learning_rate": 5.042480327584231e-06,
"loss": 4.1572,
"step": 3990
},
{
"epoch": 0.9232278353192349,
"grad_norm": 1.2950676679611206,
"learning_rate": 4.753454375987898e-06,
"loss": 4.1748,
"step": 4000
},
{
"epoch": 0.925535904907533,
"grad_norm": 1.3021730184555054,
"learning_rate": 4.472824625647503e-06,
"loss": 4.1417,
"step": 4010
},
{
"epoch": 0.927843974495831,
"grad_norm": 1.3200892210006714,
"learning_rate": 4.2006072987998355e-06,
"loss": 4.1742,
"step": 4020
},
{
"epoch": 0.9301520440841291,
"grad_norm": 1.2840920686721802,
"learning_rate": 3.9368181313886085e-06,
"loss": 4.1808,
"step": 4030
},
{
"epoch": 0.9324601136724272,
"grad_norm": 1.2291038036346436,
"learning_rate": 3.681472372154659e-06,
"loss": 4.2205,
"step": 4040
},
{
"epoch": 0.9347681832607253,
"grad_norm": 1.2467334270477295,
"learning_rate": 3.434584781754668e-06,
"loss": 4.2397,
"step": 4050
},
{
"epoch": 0.9370762528490234,
"grad_norm": 1.292160153388977,
"learning_rate": 3.196169631907658e-06,
"loss": 4.1912,
"step": 4060
},
{
"epoch": 0.9393843224373215,
"grad_norm": 1.3593335151672363,
"learning_rate": 2.966240704570205e-06,
"loss": 4.1743,
"step": 4070
},
{
"epoch": 0.9416923920256196,
"grad_norm": 1.2135837078094482,
"learning_rate": 2.7448112911396257e-06,
"loss": 4.1725,
"step": 4080
},
{
"epoch": 0.9440004616139177,
"grad_norm": 1.2826229333877563,
"learning_rate": 2.5318941916857236e-06,
"loss": 4.1779,
"step": 4090
},
{
"epoch": 0.9463085312022157,
"grad_norm": 1.3116379976272583,
"learning_rate": 2.327501714210783e-06,
"loss": 4.1753,
"step": 4100
},
{
"epoch": 0.9486166007905138,
"grad_norm": 1.2221295833587646,
"learning_rate": 2.1316456739381373e-06,
"loss": 4.1947,
"step": 4110
},
{
"epoch": 0.9509246703788119,
"grad_norm": 1.2066534757614136,
"learning_rate": 1.9443373926291806e-06,
"loss": 4.1914,
"step": 4120
},
{
"epoch": 0.95323273996711,
"grad_norm": 1.2396924495697021,
"learning_rate": 1.765587697928844e-06,
"loss": 4.1717,
"step": 4130
},
{
"epoch": 0.9555408095554081,
"grad_norm": 1.2560229301452637,
"learning_rate": 1.5954069227397782e-06,
"loss": 4.1566,
"step": 4140
},
{
"epoch": 0.9578488791437062,
"grad_norm": 1.4303022623062134,
"learning_rate": 1.43380490462493e-06,
"loss": 4.1762,
"step": 4150
},
{
"epoch": 0.9601569487320043,
"grad_norm": 1.2595306634902954,
"learning_rate": 1.2807909852389498e-06,
"loss": 4.1753,
"step": 4160
},
{
"epoch": 0.9624650183203024,
"grad_norm": 1.1923511028289795,
"learning_rate": 1.1363740097881547e-06,
"loss": 4.1417,
"step": 4170
},
{
"epoch": 0.9647730879086004,
"grad_norm": 1.1929636001586914,
"learning_rate": 1.0005623265192053e-06,
"loss": 4.1698,
"step": 4180
},
{
"epoch": 0.9670811574968985,
"grad_norm": 1.2409882545471191,
"learning_rate": 8.733637862365251e-07,
"loss": 4.1773,
"step": 4190
},
{
"epoch": 0.9693892270851966,
"grad_norm": 1.2641384601593018,
"learning_rate": 7.547857418485459e-07,
"loss": 4.1271,
"step": 4200
},
{
"epoch": 0.9716972966734947,
"grad_norm": 1.1365667581558228,
"learning_rate": 6.448350479425157e-07,
"loss": 4.14,
"step": 4210
},
{
"epoch": 0.9740053662617928,
"grad_norm": 1.2669204473495483,
"learning_rate": 5.435180603884148e-07,
"loss": 4.1989,
"step": 4220
},
{
"epoch": 0.9763134358500909,
"grad_norm": 1.2706636190414429,
"learning_rate": 4.508406359714001e-07,
"loss": 4.1727,
"step": 4230
},
{
"epoch": 0.978621505438389,
"grad_norm": 1.293450951576233,
"learning_rate": 3.6680813205339223e-07,
"loss": 4.1872,
"step": 4240
},
{
"epoch": 0.9809295750266871,
"grad_norm": 1.2493462562561035,
"learning_rate": 2.9142540626325707e-07,
"loss": 4.1937,
"step": 4250
},
{
"epoch": 0.9832376446149852,
"grad_norm": 1.265554428100586,
"learning_rate": 2.246968162160634e-07,
"loss": 4.1332,
"step": 4260
},
{
"epoch": 0.9855457142032832,
"grad_norm": 1.2552322149276733,
"learning_rate": 1.6662621926118446e-07,
"loss": 4.221,
"step": 4270
},
{
"epoch": 0.9878537837915813,
"grad_norm": 1.2385802268981934,
"learning_rate": 1.1721697225932636e-07,
"loss": 4.1898,
"step": 4280
},
{
"epoch": 0.9901618533798794,
"grad_norm": 1.358494520187378,
"learning_rate": 7.647193138843322e-08,
"loss": 4.1357,
"step": 4290
},
{
"epoch": 0.9924699229681775,
"grad_norm": 1.174631953239441,
"learning_rate": 4.439345197861932e-08,
"loss": 4.1818,
"step": 4300
},
{
"epoch": 0.9947779925564756,
"grad_norm": 1.249579668045044,
"learning_rate": 2.0983388376011324e-08,
"loss": 4.1376,
"step": 4310
},
{
"epoch": 0.9970860621447737,
"grad_norm": 1.2420294284820557,
"learning_rate": 6.243093835567314e-09,
"loss": 4.1601,
"step": 4320
},
{
"epoch": 0.9993941317330718,
"grad_norm": 1.2425047159194946,
"learning_rate": 1.734204427727981e-10,
"loss": 4.1714,
"step": 4330
}
],
"logging_steps": 10,
"max_steps": 4332,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 434,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2436421205975040.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}