{
  "best_metric": 4.381021022796631,
  "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/mix-bpe/checkpoint-3906",
  "epoch": 0.9998557456507313,
  "eval_steps": 434,
  "global_step": 4332,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002308069588298087,
      "grad_norm": 1.293995976448059,
      "learning_rate": 1.4999999999999999e-05,
      "loss": 8.3864,
      "step": 10
    },
    {
      "epoch": 0.004616139176596174,
      "grad_norm": 1.6057629585266113,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 8.3534,
      "step": 20
    },
    {
      "epoch": 0.006924208764894262,
      "grad_norm": 1.8857852220535278,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 8.2432,
      "step": 30
    },
    {
      "epoch": 0.009232278353192349,
      "grad_norm": 1.6119285821914673,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 8.0835,
      "step": 40
    },
    {
      "epoch": 0.011540347941490435,
      "grad_norm": 1.474076747894287,
      "learning_rate": 7.5e-05,
      "loss": 7.8521,
      "step": 50
    },
    {
      "epoch": 0.013848417529788524,
      "grad_norm": 1.3890337944030762,
      "learning_rate": 8.999999999999999e-05,
      "loss": 7.6704,
      "step": 60
    },
    {
      "epoch": 0.01615648711808661,
      "grad_norm": 1.7293298244476318,
      "learning_rate": 0.00010499999999999999,
      "loss": 7.4797,
      "step": 70
    },
    {
      "epoch": 0.018464556706384697,
      "grad_norm": 1.161170244216919,
      "learning_rate": 0.00011999999999999999,
      "loss": 7.2628,
      "step": 80
    },
    {
      "epoch": 0.020772626294682784,
      "grad_norm": 1.2024428844451904,
      "learning_rate": 0.000135,
      "loss": 7.0311,
      "step": 90
    },
    {
      "epoch": 0.02308069588298087,
      "grad_norm": 0.9382893443107605,
      "learning_rate": 0.00015,
      "loss": 6.79,
      "step": 100
    },
    {
      "epoch": 0.025388765471278957,
      "grad_norm": 0.9623205661773682,
      "learning_rate": 0.000165,
      "loss": 6.6329,
      "step": 110
    },
    {
      "epoch": 0.027696835059577048,
      "grad_norm": 0.5374640822410583,
      "learning_rate": 0.00017999999999999998,
      "loss": 6.4776,
      "step": 120
    },
    {
      "epoch": 0.030004904647875134,
      "grad_norm": 0.37330469489097595,
      "learning_rate": 0.000195,
      "loss": 6.4591,
      "step": 130
    },
    {
      "epoch": 0.03231297423617322,
      "grad_norm": 0.4222196340560913,
      "learning_rate": 0.00020999999999999998,
      "loss": 6.4463,
      "step": 140
    },
    {
      "epoch": 0.03462104382447131,
      "grad_norm": 0.4244931638240814,
      "learning_rate": 0.000225,
      "loss": 6.4109,
      "step": 150
    },
    {
      "epoch": 0.036929113412769395,
      "grad_norm": 0.5667627453804016,
      "learning_rate": 0.00023999999999999998,
      "loss": 6.3945,
      "step": 160
    },
    {
      "epoch": 0.03923718300106748,
      "grad_norm": 0.4752316474914551,
      "learning_rate": 0.00025499999999999996,
      "loss": 6.3878,
      "step": 170
    },
    {
      "epoch": 0.04154525258936557,
      "grad_norm": 0.5644646883010864,
      "learning_rate": 0.00027,
      "loss": 6.3093,
      "step": 180
    },
    {
      "epoch": 0.043853322177663655,
      "grad_norm": 0.6428855657577515,
      "learning_rate": 0.000285,
      "loss": 6.3548,
      "step": 190
    },
    {
      "epoch": 0.04616139176596174,
      "grad_norm": 0.8332350850105286,
      "learning_rate": 0.0003,
      "loss": 6.2497,
      "step": 200
    },
    {
      "epoch": 0.04846946135425983,
      "grad_norm": 0.8160709142684937,
      "learning_rate": 0.0002999956645089803,
      "loss": 6.1748,
      "step": 210
    },
    {
      "epoch": 0.050777530942557915,
      "grad_norm": 0.7665246725082397,
      "learning_rate": 0.000299982658286541,
      "loss": 6.1079,
      "step": 220
    },
    {
      "epoch": 0.05308560053085601,
      "grad_norm": 0.7429030537605286,
      "learning_rate": 0.00029996098208452687,
      "loss": 6.032,
      "step": 230
    },
    {
      "epoch": 0.055393670119154095,
      "grad_norm": 0.9278781414031982,
      "learning_rate": 0.0002999306371559644,
      "loss": 6.023,
      "step": 240
    },
    {
      "epoch": 0.05770173970745218,
      "grad_norm": 0.7202680706977844,
      "learning_rate": 0.00029989162525498905,
      "loss": 5.9386,
      "step": 250
    },
    {
      "epoch": 0.06000980929575027,
      "grad_norm": 0.7615482807159424,
      "learning_rate": 0.000299843948636744,
      "loss": 5.8888,
      "step": 260
    },
    {
      "epoch": 0.062317878884048356,
      "grad_norm": 0.7727493643760681,
      "learning_rate": 0.00029978761005725014,
      "loss": 5.8483,
      "step": 270
    },
    {
      "epoch": 0.06462594847234644,
      "grad_norm": 0.8060325980186462,
      "learning_rate": 0.0002997226127732461,
      "loss": 5.8168,
      "step": 280
    },
    {
      "epoch": 0.06693401806064453,
      "grad_norm": 0.801364541053772,
      "learning_rate": 0.0002996489605420004,
      "loss": 5.7915,
      "step": 290
    },
    {
      "epoch": 0.06924208764894262,
      "grad_norm": 0.6761994957923889,
      "learning_rate": 0.0002995666576210942,
      "loss": 5.816,
      "step": 300
    },
    {
      "epoch": 0.0715501572372407,
      "grad_norm": 0.854761004447937,
      "learning_rate": 0.0002994757087681753,
      "loss": 5.7237,
      "step": 310
    },
    {
      "epoch": 0.07385822682553879,
      "grad_norm": 0.810724675655365,
      "learning_rate": 0.0002993761192406826,
      "loss": 5.6849,
      "step": 320
    },
    {
      "epoch": 0.07616629641383688,
      "grad_norm": 0.7817623615264893,
      "learning_rate": 0.000299267894795543,
      "loss": 5.6347,
      "step": 330
    },
    {
      "epoch": 0.07847436600213496,
      "grad_norm": 0.7851743698120117,
      "learning_rate": 0.0002991510416888378,
      "loss": 5.5995,
      "step": 340
    },
    {
      "epoch": 0.08078243559043305,
      "grad_norm": 0.736893355846405,
      "learning_rate": 0.0002990255666754418,
      "loss": 5.6445,
      "step": 350
    },
    {
      "epoch": 0.08309050517873114,
      "grad_norm": 0.8936936855316162,
      "learning_rate": 0.00029889147700863205,
      "loss": 5.6018,
      "step": 360
    },
    {
      "epoch": 0.08539857476702922,
      "grad_norm": 0.852159857749939,
      "learning_rate": 0.00029874878043966926,
      "loss": 5.5471,
      "step": 370
    },
    {
      "epoch": 0.08770664435532731,
      "grad_norm": 0.7617043256759644,
      "learning_rate": 0.0002985974852173493,
      "loss": 5.5397,
      "step": 380
    },
    {
      "epoch": 0.0900147139436254,
      "grad_norm": 0.7751661539077759,
      "learning_rate": 0.0002984376000875267,
      "loss": 5.4445,
      "step": 390
    },
    {
      "epoch": 0.09232278353192348,
      "grad_norm": 0.8484746813774109,
      "learning_rate": 0.00029826913429260843,
      "loss": 5.4171,
      "step": 400
    },
    {
      "epoch": 0.09463085312022157,
      "grad_norm": 0.885892391204834,
      "learning_rate": 0.0002980920975710206,
      "loss": 5.4414,
      "step": 410
    },
    {
      "epoch": 0.09693892270851966,
      "grad_norm": 0.8213350176811218,
      "learning_rate": 0.0002979065001566447,
      "loss": 5.4237,
      "step": 420
    },
    {
      "epoch": 0.09924699229681774,
      "grad_norm": 0.839364767074585,
      "learning_rate": 0.00029771235277822633,
      "loss": 5.4203,
      "step": 430
    },
    {
      "epoch": 0.10017022013213699,
      "eval_loss": 5.607814311981201,
      "eval_runtime": 38.9963,
      "eval_samples_per_second": 641.087,
      "eval_steps_per_second": 80.136,
      "step": 434
    },
    {
      "epoch": 0.10155506188511583,
      "grad_norm": 0.8963534832000732,
      "learning_rate": 0.0002975096666587551,
      "loss": 5.3826,
      "step": 440
    },
    {
      "epoch": 0.10386313147341392,
      "grad_norm": 0.9185658097267151,
      "learning_rate": 0.0002972984535148157,
      "loss": 5.3868,
      "step": 450
    },
    {
      "epoch": 0.10617120106171202,
      "grad_norm": 0.9081022143363953,
      "learning_rate": 0.0002970787255559106,
      "loss": 5.4027,
      "step": 460
    },
    {
      "epoch": 0.1084792706500101,
      "grad_norm": 0.8252591490745544,
      "learning_rate": 0.00029685049548375426,
      "loss": 5.3417,
      "step": 470
    },
    {
      "epoch": 0.11078734023830819,
      "grad_norm": 0.8946505784988403,
      "learning_rate": 0.0002966137764915393,
      "loss": 5.2916,
      "step": 480
    },
    {
      "epoch": 0.11309540982660628,
      "grad_norm": 0.8276567459106445,
      "learning_rate": 0.00029636858226317304,
      "loss": 5.2734,
      "step": 490
    },
    {
      "epoch": 0.11540347941490436,
      "grad_norm": 0.8609122037887573,
      "learning_rate": 0.00029611492697248726,
      "loss": 5.3293,
      "step": 500
    },
    {
      "epoch": 0.11771154900320245,
      "grad_norm": 0.9473890662193298,
      "learning_rate": 0.0002958528252824184,
      "loss": 5.3,
      "step": 510
    },
    {
      "epoch": 0.12001961859150054,
      "grad_norm": 0.8542040586471558,
      "learning_rate": 0.0002955822923441601,
      "loss": 5.2721,
      "step": 520
    },
    {
      "epoch": 0.12232768817979862,
      "grad_norm": 0.8030778169631958,
      "learning_rate": 0.00029530334379628735,
      "loss": 5.2909,
      "step": 530
    },
    {
      "epoch": 0.12463575776809671,
      "grad_norm": 0.8569799065589905,
      "learning_rate": 0.0002950159957638525,
      "loss": 5.2491,
      "step": 540
    },
    {
      "epoch": 0.12694382735639478,
      "grad_norm": 0.8883563280105591,
      "learning_rate": 0.00029472026485745297,
      "loss": 5.1771,
      "step": 550
    },
    {
      "epoch": 0.12925189694469288,
      "grad_norm": 0.9277822971343994,
      "learning_rate": 0.00029441616817227145,
      "loss": 5.1982,
      "step": 560
    },
    {
      "epoch": 0.13155996653299096,
      "grad_norm": 0.888548731803894,
      "learning_rate": 0.0002941037232870871,
      "loss": 5.2259,
      "step": 570
    },
    {
      "epoch": 0.13386803612128906,
      "grad_norm": 0.8762586712837219,
      "learning_rate": 0.00029378294826325993,
      "loss": 5.2699,
      "step": 580
    },
    {
      "epoch": 0.13617610570958713,
      "grad_norm": 1.1397993564605713,
      "learning_rate": 0.0002934538616436863,
      "loss": 5.2092,
      "step": 590
    },
    {
      "epoch": 0.13848417529788523,
      "grad_norm": 0.8557673096656799,
      "learning_rate": 0.0002931164824517275,
      "loss": 5.2417,
      "step": 600
    },
    {
      "epoch": 0.14079224488618333,
      "grad_norm": 0.8277891874313354,
      "learning_rate": 0.00029277083019010945,
      "loss": 5.16,
      "step": 610
    },
    {
      "epoch": 0.1431003144744814,
      "grad_norm": 0.8815522193908691,
      "learning_rate": 0.00029241692483979593,
      "loss": 5.1709,
      "step": 620
    },
    {
      "epoch": 0.1454083840627795,
      "grad_norm": 0.9206939339637756,
      "learning_rate": 0.0002920547868588331,
      "loss": 5.1484,
      "step": 630
    },
    {
      "epoch": 0.14771645365107758,
      "grad_norm": 0.8649567365646362,
      "learning_rate": 0.00029168443718116725,
      "loss": 5.1292,
      "step": 640
    },
    {
      "epoch": 0.15002452323937568,
      "grad_norm": 0.8811323642730713,
      "learning_rate": 0.00029130589721543433,
      "loss": 5.1518,
      "step": 650
    },
    {
      "epoch": 0.15233259282767375,
      "grad_norm": 0.8579047918319702,
      "learning_rate": 0.0002909191888437227,
      "loss": 5.0781,
      "step": 660
    },
    {
      "epoch": 0.15464066241597185,
      "grad_norm": 0.9432843923568726,
      "learning_rate": 0.00029052433442030797,
      "loss": 5.1191,
      "step": 670
    },
    {
      "epoch": 0.15694873200426993,
      "grad_norm": 0.8163111805915833,
      "learning_rate": 0.00029012135677036077,
      "loss": 5.0546,
      "step": 680
    },
    {
      "epoch": 0.15925680159256803,
      "grad_norm": 0.8901731967926025,
      "learning_rate": 0.00028971027918862777,
      "loss": 5.0731,
      "step": 690
    },
    {
      "epoch": 0.1615648711808661,
      "grad_norm": 0.9626258015632629,
      "learning_rate": 0.00028929112543808435,
      "loss": 5.0357,
      "step": 700
    },
    {
      "epoch": 0.1638729407691642,
      "grad_norm": 0.898350179195404,
      "learning_rate": 0.0002888639197485614,
      "loss": 5.1027,
      "step": 710
    },
    {
      "epoch": 0.16618101035746227,
      "grad_norm": 0.9431145787239075,
      "learning_rate": 0.00028842868681534486,
      "loss": 4.9996,
      "step": 720
    },
    {
      "epoch": 0.16848907994576037,
      "grad_norm": 0.8631531596183777,
      "learning_rate": 0.0002879854517977475,
      "loss": 5.0532,
      "step": 730
    },
    {
      "epoch": 0.17079714953405845,
      "grad_norm": 0.9095075726509094,
      "learning_rate": 0.0002875342403176553,
      "loss": 5.0117,
      "step": 740
    },
    {
      "epoch": 0.17310521912235655,
      "grad_norm": 0.8849276304244995,
      "learning_rate": 0.00028707507845804575,
      "loss": 4.997,
      "step": 750
    },
    {
      "epoch": 0.17541328871065462,
      "grad_norm": 1.015526533126831,
      "learning_rate": 0.00028660799276148053,
      "loss": 5.017,
      "step": 760
    },
    {
      "epoch": 0.17772135829895272,
      "grad_norm": 0.9242818355560303,
      "learning_rate": 0.00028613301022857086,
      "loss": 5.0325,
      "step": 770
    },
    {
      "epoch": 0.1800294278872508,
      "grad_norm": 0.871295154094696,
      "learning_rate": 0.0002856501583164168,
      "loss": 5.0235,
      "step": 780
    },
    {
      "epoch": 0.1823374974755489,
      "grad_norm": 0.9293336868286133,
      "learning_rate": 0.0002851594649370201,
      "loss": 5.0311,
      "step": 790
    },
    {
      "epoch": 0.18464556706384697,
      "grad_norm": 0.8775876760482788,
      "learning_rate": 0.00028466095845567057,
      "loss": 5.0069,
      "step": 800
    },
    {
      "epoch": 0.18695363665214507,
      "grad_norm": 0.9968228340148926,
      "learning_rate": 0.0002841546676893065,
      "loss": 4.9595,
      "step": 810
    },
    {
      "epoch": 0.18926170624044314,
      "grad_norm": 0.9142509698867798,
      "learning_rate": 0.000283640621904849,
      "loss": 4.9601,
      "step": 820
    },
    {
      "epoch": 0.19156977582874124,
      "grad_norm": 0.863057017326355,
      "learning_rate": 0.0002831188508175096,
      "loss": 4.9811,
      "step": 830
    },
    {
      "epoch": 0.1938778454170393,
      "grad_norm": 0.8651947379112244,
      "learning_rate": 0.00028258938458907334,
      "loss": 4.9288,
      "step": 840
    },
    {
      "epoch": 0.1961859150053374,
      "grad_norm": 0.8541522026062012,
      "learning_rate": 0.0002820522538261545,
      "loss": 4.9037,
      "step": 850
    },
    {
      "epoch": 0.1984939845936355,
      "grad_norm": 0.9082189202308655,
      "learning_rate": 0.0002815074895784278,
      "loss": 4.8757,
      "step": 860
    },
    {
      "epoch": 0.20034044026427397,
      "eval_loss": 5.134793758392334,
      "eval_runtime": 39.0568,
      "eval_samples_per_second": 640.093,
      "eval_steps_per_second": 80.012,
      "step": 868
    },
    {
      "epoch": 0.2008020541819336,
      "grad_norm": 0.8799365162849426,
      "learning_rate": 0.0002809551233368332,
      "loss": 4.942,
      "step": 870
    },
    {
      "epoch": 0.20311012377023166,
      "grad_norm": 0.9927550554275513,
      "learning_rate": 0.00028039518703175577,
      "loss": 4.8919,
      "step": 880
    },
    {
      "epoch": 0.20541819335852976,
      "grad_norm": 0.9068810343742371,
      "learning_rate": 0.00027982771303117996,
      "loss": 4.9315,
      "step": 890
    },
    {
      "epoch": 0.20772626294682783,
      "grad_norm": 0.975292980670929,
      "learning_rate": 0.000279252734138818,
      "loss": 4.919,
      "step": 900
    },
    {
      "epoch": 0.21003433253512593,
      "grad_norm": 0.8984159231185913,
      "learning_rate": 0.0002786702835922144,
      "loss": 4.8989,
      "step": 910
    },
    {
      "epoch": 0.21234240212342403,
      "grad_norm": 0.9731834530830383,
      "learning_rate": 0.0002780803950608239,
      "loss": 4.8991,
      "step": 920
    },
    {
      "epoch": 0.2146504717117221,
      "grad_norm": 0.932558536529541,
      "learning_rate": 0.00027748310264406564,
      "loss": 4.8866,
      "step": 930
    },
    {
      "epoch": 0.2169585413000202,
      "grad_norm": 0.9369585514068604,
      "learning_rate": 0.00027687844086935176,
      "loss": 4.8829,
      "step": 940
    },
    {
      "epoch": 0.21926661088831828,
      "grad_norm": 0.9317886829376221,
      "learning_rate": 0.0002762664446900914,
      "loss": 4.8399,
      "step": 950
    },
    {
      "epoch": 0.22157468047661638,
      "grad_norm": 0.9692897796630859,
      "learning_rate": 0.00027564714948367046,
      "loss": 4.8339,
      "step": 960
    },
    {
      "epoch": 0.22388275006491445,
      "grad_norm": 0.912087082862854,
      "learning_rate": 0.0002750205910494064,
      "loss": 4.8176,
      "step": 970
    },
    {
      "epoch": 0.22619081965321255,
      "grad_norm": 0.9403560757637024,
      "learning_rate": 0.00027438680560647877,
      "loss": 4.8227,
      "step": 980
    },
    {
      "epoch": 0.22849888924151063,
      "grad_norm": 0.9240433573722839,
      "learning_rate": 0.0002737458297918355,
      "loss": 4.8053,
      "step": 990
    },
    {
      "epoch": 0.23080695882980873,
      "grad_norm": 0.9296719431877136,
      "learning_rate": 0.000273097700658075,
      "loss": 4.8571,
      "step": 1000
    },
    {
      "epoch": 0.2331150284181068,
      "grad_norm": 0.9312313199043274,
      "learning_rate": 0.0002724424556713046,
      "loss": 4.8177,
      "step": 1010
    },
    {
      "epoch": 0.2354230980064049,
      "grad_norm": 0.9134889245033264,
      "learning_rate": 0.0002717801327089743,
      "loss": 4.7824,
      "step": 1020
    },
    {
      "epoch": 0.23773116759470297,
      "grad_norm": 0.8830430507659912,
      "learning_rate": 0.0002711107700576875,
      "loss": 4.7996,
      "step": 1030
    },
    {
      "epoch": 0.24003923718300108,
      "grad_norm": 0.9481594562530518,
      "learning_rate": 0.00027043440641098777,
      "loss": 4.8118,
      "step": 1040
    },
    {
      "epoch": 0.24234730677129915,
      "grad_norm": 0.9204795956611633,
      "learning_rate": 0.0002697510808671219,
      "loss": 4.7847,
      "step": 1050
    },
    {
      "epoch": 0.24465537635959725,
      "grad_norm": 0.9537863731384277,
      "learning_rate": 0.0002690608329267801,
      "loss": 4.79,
      "step": 1060
    },
    {
      "epoch": 0.24696344594789532,
      "grad_norm": 0.9290711283683777,
      "learning_rate": 0.00026836370249081235,
      "loss": 4.7671,
      "step": 1070
    },
    {
      "epoch": 0.24927151553619342,
      "grad_norm": 0.9401620626449585,
      "learning_rate": 0.00026765972985792183,
      "loss": 4.7715,
      "step": 1080
    },
    {
      "epoch": 0.2515795851244915,
      "grad_norm": 0.9478754997253418,
      "learning_rate": 0.00026694895572233556,
      "loss": 4.8047,
      "step": 1090
    },
    {
      "epoch": 0.25388765471278957,
      "grad_norm": 0.9853310585021973,
      "learning_rate": 0.000266231421171452,
      "loss": 4.744,
      "step": 1100
    },
    {
      "epoch": 0.25619572430108767,
      "grad_norm": 0.9713465571403503,
      "learning_rate": 0.0002655071676834659,
      "loss": 4.7124,
      "step": 1110
    },
    {
      "epoch": 0.25850379388938577,
      "grad_norm": 0.9828191995620728,
      "learning_rate": 0.00026477623712497047,
      "loss": 4.7588,
      "step": 1120
    },
    {
      "epoch": 0.26081186347768387,
      "grad_norm": 0.9578918814659119,
      "learning_rate": 0.0002640386717485373,
      "loss": 4.7603,
      "step": 1130
    },
    {
      "epoch": 0.2631199330659819,
      "grad_norm": 0.9244778156280518,
      "learning_rate": 0.0002632945141902739,
      "loss": 4.7823,
      "step": 1140
    },
    {
      "epoch": 0.26542800265428,
      "grad_norm": 1.0172864198684692,
      "learning_rate": 0.00026254380746735926,
      "loss": 4.7744,
      "step": 1150
    },
    {
      "epoch": 0.2677360722425781,
      "grad_norm": 0.9416136741638184,
      "learning_rate": 0.00026178659497555663,
      "loss": 4.7059,
      "step": 1160
    },
    {
      "epoch": 0.2700441418308762,
      "grad_norm": 0.8836077451705933,
      "learning_rate": 0.0002610229204867055,
      "loss": 4.7969,
      "step": 1170
    },
    {
      "epoch": 0.27235221141917426,
      "grad_norm": 0.9215325713157654,
      "learning_rate": 0.000260252828146191,
      "loss": 4.7436,
      "step": 1180
    },
    {
      "epoch": 0.27466028100747236,
      "grad_norm": 0.9629151225090027,
      "learning_rate": 0.0002594763624703922,
      "loss": 4.7368,
      "step": 1190
    },
    {
      "epoch": 0.27696835059577046,
      "grad_norm": 1.0467829704284668,
      "learning_rate": 0.00025869356834410864,
      "loss": 4.6909,
      "step": 1200
    },
    {
      "epoch": 0.27927642018406856,
      "grad_norm": 0.9704943299293518,
      "learning_rate": 0.00025790449101796575,
      "loss": 4.6959,
      "step": 1210
    },
    {
      "epoch": 0.28158448977236666,
      "grad_norm": 0.9856391549110413,
      "learning_rate": 0.0002571091761057989,
      "loss": 4.6771,
      "step": 1220
    },
    {
      "epoch": 0.2838925593606647,
      "grad_norm": 1.040661096572876,
      "learning_rate": 0.00025630766958201695,
      "loss": 4.7199,
      "step": 1230
    },
    {
      "epoch": 0.2862006289489628,
      "grad_norm": 0.9736753702163696,
      "learning_rate": 0.0002555000177789444,
      "loss": 4.6799,
      "step": 1240
    },
    {
      "epoch": 0.2885086985372609,
      "grad_norm": 1.006039023399353,
      "learning_rate": 0.00025468626738414305,
      "loss": 4.694,
      "step": 1250
    },
    {
      "epoch": 0.290816768125559,
      "grad_norm": 0.96723473072052,
      "learning_rate": 0.0002538664654377134,
      "loss": 4.6445,
      "step": 1260
    },
    {
      "epoch": 0.29312483771385706,
      "grad_norm": 0.9628943800926208,
      "learning_rate": 0.00025304065932957494,
      "loss": 4.6465,
      "step": 1270
    },
    {
      "epoch": 0.29543290730215516,
      "grad_norm": 0.9398277997970581,
      "learning_rate": 0.00025220889679672745,
      "loss": 4.6037,
      "step": 1280
    },
    {
      "epoch": 0.29774097689045326,
      "grad_norm": 0.9146639108657837,
      "learning_rate": 0.00025137122592049066,
      "loss": 4.6605,
      "step": 1290
    },
    {
      "epoch": 0.30004904647875136,
      "grad_norm": 0.9692845940589905,
      "learning_rate": 0.0002505276951237254,
      "loss": 4.6331,
      "step": 1300
    },
    {
      "epoch": 0.30051066039641094,
      "eval_loss": 4.868031978607178,
      "eval_runtime": 39.4278,
      "eval_samples_per_second": 634.071,
      "eval_steps_per_second": 79.259,
      "step": 1302
    },
    {
      "epoch": 0.3023571160670494,
      "grad_norm": 0.9402655959129333,
      "learning_rate": 0.00024967835316803434,
      "loss": 4.6622,
      "step": 1310
    },
    {
      "epoch": 0.3046651856553475,
      "grad_norm": 0.9427255392074585,
      "learning_rate": 0.00024882324915094305,
      "loss": 4.6359,
      "step": 1320
    },
    {
      "epoch": 0.3069732552436456,
      "grad_norm": 0.9967660307884216,
      "learning_rate": 0.00024796243250306196,
      "loss": 4.6153,
      "step": 1330
    },
    {
      "epoch": 0.3092813248319437,
      "grad_norm": 0.9784890413284302,
      "learning_rate": 0.00024709595298522916,
      "loss": 4.6401,
      "step": 1340
    },
    {
      "epoch": 0.31158939442024175,
      "grad_norm": 1.011221170425415,
      "learning_rate": 0.00024622386068563344,
      "loss": 4.5711,
      "step": 1350
    },
    {
      "epoch": 0.31389746400853985,
      "grad_norm": 1.0162895917892456,
      "learning_rate": 0.0002453462060169193,
      "loss": 4.6102,
      "step": 1360
    },
    {
      "epoch": 0.31620553359683795,
      "grad_norm": 0.9559857845306396,
      "learning_rate": 0.00024446303971327254,
      "loss": 4.6215,
      "step": 1370
    },
    {
      "epoch": 0.31851360318513605,
      "grad_norm": 0.9640511870384216,
      "learning_rate": 0.00024357441282748756,
      "loss": 4.6299,
      "step": 1380
    },
    {
      "epoch": 0.3208216727734341,
      "grad_norm": 0.990332841873169,
      "learning_rate": 0.00024268037672801605,
      "loss": 4.6633,
      "step": 1390
    },
    {
      "epoch": 0.3231297423617322,
      "grad_norm": 0.971305787563324,
      "learning_rate": 0.00024178098309599782,
      "loss": 4.6453,
      "step": 1400
    },
    {
      "epoch": 0.3254378119500303,
      "grad_norm": 0.9675345420837402,
      "learning_rate": 0.00024087628392227304,
      "loss": 4.562,
      "step": 1410
    },
    {
      "epoch": 0.3277458815383284,
      "grad_norm": 0.9208952188491821,
      "learning_rate": 0.000239966331504377,
      "loss": 4.576,
      "step": 1420
    },
    {
      "epoch": 0.33005395112662644,
      "grad_norm": 1.004249930381775,
      "learning_rate": 0.00023905117844351674,
      "loss": 4.6263,
      "step": 1430
    },
    {
      "epoch": 0.33236202071492454,
      "grad_norm": 0.9654126763343811,
      "learning_rate": 0.0002381308776415307,
      "loss": 4.6431,
      "step": 1440
    },
    {
      "epoch": 0.33467009030322264,
      "grad_norm": 0.9294122457504272,
      "learning_rate": 0.0002372054822978304,
      "loss": 4.5542,
      "step": 1450
    },
    {
      "epoch": 0.33697815989152075,
      "grad_norm": 0.8988949060440063,
      "learning_rate": 0.00023627504590632517,
      "loss": 4.6312,
      "step": 1460
    },
    {
      "epoch": 0.3392862294798188,
      "grad_norm": 1.0338096618652344,
      "learning_rate": 0.00023533962225232992,
      "loss": 4.5963,
      "step": 1470
    },
    {
      "epoch": 0.3415942990681169,
      "grad_norm": 0.9440902471542358,
      "learning_rate": 0.00023439926540945604,
      "loss": 4.5587,
      "step": 1480
    },
    {
      "epoch": 0.343902368656415,
      "grad_norm": 1.0818684101104736,
      "learning_rate": 0.00023345402973648548,
      "loss": 4.5462,
      "step": 1490
    },
    {
      "epoch": 0.3462104382447131,
      "grad_norm": 0.9467376470565796,
      "learning_rate": 0.00023250396987422857,
      "loss": 4.5969,
      "step": 1500
    },
    {
      "epoch": 0.3485185078330112,
      "grad_norm": 1.009598970413208,
      "learning_rate": 0.00023154914074236522,
      "loss": 4.5773,
      "step": 1510
    },
    {
      "epoch": 0.35082657742130924,
      "grad_norm": 1.053017020225525,
      "learning_rate": 0.00023058959753627056,
      "loss": 4.6051,
      "step": 1520
    },
    {
      "epoch": 0.35313464700960734,
      "grad_norm": 0.969485342502594,
      "learning_rate": 0.0002296253957238239,
      "loss": 4.604,
      "step": 1530
    },
    {
      "epoch": 0.35544271659790544,
      "grad_norm": 1.2008713483810425,
      "learning_rate": 0.00022865659104220255,
      "loss": 4.5704,
      "step": 1540
    },
    {
      "epoch": 0.35775078618620354,
      "grad_norm": 1.0302430391311646,
      "learning_rate": 0.00022768323949465987,
      "loss": 4.5391,
      "step": 1550
    },
    {
      "epoch": 0.3600588557745016,
      "grad_norm": 0.966468870639801,
      "learning_rate": 0.0002267053973472877,
      "loss": 4.5363,
      "step": 1560
    },
    {
      "epoch": 0.3623669253627997,
      "grad_norm": 0.9762535095214844,
      "learning_rate": 0.00022572312112576406,
      "loss": 4.5587,
      "step": 1570
    },
    {
      "epoch": 0.3646749949510978,
      "grad_norm": 1.0306516885757446,
      "learning_rate": 0.0002247364676120855,
      "loss": 4.5296,
      "step": 1580
    },
    {
      "epoch": 0.3669830645393959,
      "grad_norm": 0.9838928580284119,
      "learning_rate": 0.00022374549384128456,
      "loss": 4.5947,
      "step": 1590
    },
    {
      "epoch": 0.36929113412769393,
      "grad_norm": 0.9782153964042664,
      "learning_rate": 0.0002227502570981331,
      "loss": 4.5091,
      "step": 1600
    },
    {
      "epoch": 0.37159920371599203,
      "grad_norm": 1.0444267988204956,
      "learning_rate": 0.00022175081491383048,
      "loss": 4.5221,
      "step": 1610
    },
    {
      "epoch": 0.37390727330429013,
      "grad_norm": 1.0300132036209106,
      "learning_rate": 0.00022074722506267846,
      "loss": 4.5312,
      "step": 1620
    },
    {
      "epoch": 0.37621534289258823,
      "grad_norm": 1.0193167924880981,
      "learning_rate": 0.00021973954555874067,
      "loss": 4.5706,
      "step": 1630
    },
    {
      "epoch": 0.3785234124808863,
      "grad_norm": 1.005703091621399,
      "learning_rate": 0.00021872783465248978,
      "loss": 4.5149,
      "step": 1640
    },
    {
      "epoch": 0.3808314820691844,
      "grad_norm": 1.0483267307281494,
      "learning_rate": 0.00021771215082743968,
      "loss": 4.4801,
      "step": 1650
    },
    {
      "epoch": 0.3831395516574825,
      "grad_norm": 0.9708661437034607,
      "learning_rate": 0.00021669255279676514,
      "loss": 4.5121,
      "step": 1660
    },
    {
      "epoch": 0.3854476212457806,
      "grad_norm": 1.0603435039520264,
      "learning_rate": 0.00021566909949990746,
      "loss": 4.4826,
      "step": 1670
    },
    {
      "epoch": 0.3877556908340786,
      "grad_norm": 1.1404428482055664,
      "learning_rate": 0.0002146418500991678,
      "loss": 4.4787,
      "step": 1680
    },
    {
      "epoch": 0.3900637604223767,
      "grad_norm": 1.0174285173416138,
      "learning_rate": 0.00021361086397628682,
      "loss": 4.5004,
      "step": 1690
    },
    {
      "epoch": 0.3923718300106748,
      "grad_norm": 1.1664884090423584,
      "learning_rate": 0.0002125762007290121,
      "loss": 4.5197,
      "step": 1700
    },
    {
      "epoch": 0.3946798995989729,
      "grad_norm": 0.9837216138839722,
      "learning_rate": 0.00021153792016765334,
      "loss": 4.5019,
      "step": 1710
    },
    {
      "epoch": 0.396987969187271,
      "grad_norm": 1.0171154737472534,
      "learning_rate": 0.00021049608231162454,
      "loss": 4.514,
      "step": 1720
    },
    {
      "epoch": 0.3992960387755691,
      "grad_norm": 1.0302062034606934,
      "learning_rate": 0.00020945074738597447,
      "loss": 4.5388,
      "step": 1730
    },
    {
      "epoch": 0.40068088052854794,
      "eval_loss": 4.705195426940918,
      "eval_runtime": 39.444,
      "eval_samples_per_second": 633.81,
      "eval_steps_per_second": 79.226,
      "step": 1736
    },
    {
      "epoch": 0.4016041083638672,
      "grad_norm": 1.0344492197036743,
      "learning_rate": 0.00020840197581790569,
      "loss": 4.454,
      "step": 1740
    },
    {
      "epoch": 0.4039121779521653,
      "grad_norm": 1.0174837112426758,
      "learning_rate": 0.00020734982823328104,
      "loss": 4.4651,
      "step": 1750
    },
    {
      "epoch": 0.4062202475404633,
      "grad_norm": 0.9471856355667114,
      "learning_rate": 0.00020629436545311928,
      "loss": 4.5174,
      "step": 1760
    },
    {
      "epoch": 0.4085283171287614,
      "grad_norm": 1.0585298538208008,
      "learning_rate": 0.00020523564849007906,
      "loss": 4.4544,
      "step": 1770
    },
    {
      "epoch": 0.4108363867170595,
      "grad_norm": 1.0396827459335327,
      "learning_rate": 0.00020417373854493228,
      "loss": 4.5077,
      "step": 1780
    },
    {
      "epoch": 0.4131444563053576,
      "grad_norm": 1.0221539735794067,
      "learning_rate": 0.0002031086970030259,
      "loss": 4.4515,
      "step": 1790
    },
    {
      "epoch": 0.41545252589365567,
      "grad_norm": 0.9722737669944763,
      "learning_rate": 0.00020204058543073393,
      "loss": 4.4483,
      "step": 1800
    },
    {
      "epoch": 0.41776059548195377,
      "grad_norm": 1.0386008024215698,
      "learning_rate": 0.00020096946557189802,
      "loss": 4.5063,
      "step": 1810
    },
    {
      "epoch": 0.42006866507025187,
      "grad_norm": 1.1091365814208984,
      "learning_rate": 0.00019989539934425857,
      "loss": 4.4913,
      "step": 1820
    },
    {
      "epoch": 0.42237673465854997,
      "grad_norm": 1.083609700202942,
      "learning_rate": 0.0001988184488358754,
      "loss": 4.4873,
      "step": 1830
    },
    {
      "epoch": 0.42468480424684807,
      "grad_norm": 1.084915280342102,
      "learning_rate": 0.00019773867630153857,
      "loss": 4.4625,
      "step": 1840
    },
    {
      "epoch": 0.4269928738351461,
      "grad_norm": 0.9942842125892639,
      "learning_rate": 0.00019665614415916979,
      "loss": 4.435,
      "step": 1850
    },
    {
      "epoch": 0.4293009434234442,
      "grad_norm": 1.0121150016784668,
      "learning_rate": 0.00019557091498621416,
      "loss": 4.4056,
      "step": 1860
    },
    {
      "epoch": 0.4316090130117423,
      "grad_norm": 1.0252583026885986,
      "learning_rate": 0.00019448305151602272,
      "loss": 4.3947,
      "step": 1870
    },
    {
      "epoch": 0.4339170826000404,
      "grad_norm": 1.09006667137146,
      "learning_rate": 0.00019339261663422629,
      "loss": 4.4671,
      "step": 1880
    },
    {
      "epoch": 0.43622515218833846,
      "grad_norm": 1.0912604331970215,
      "learning_rate": 0.00019229967337510003,
      "loss": 4.3903,
      "step": 1890
    },
    {
      "epoch": 0.43853322177663656,
      "grad_norm": 1.0326118469238281,
      "learning_rate": 0.00019120428491791974,
      "loss": 4.4382,
      "step": 1900
    },
    {
      "epoch": 0.44084129136493466,
      "grad_norm": 1.0649503469467163,
      "learning_rate": 0.00019010651458330964,
      "loss": 4.3955,
      "step": 1910
    },
    {
      "epoch": 0.44314936095323276,
      "grad_norm": 1.0922999382019043,
      "learning_rate": 0.00018900642582958213,
      "loss": 4.4406,
      "step": 1920
    },
    {
      "epoch": 0.4454574305415308,
      "grad_norm": 1.0487009286880493,
      "learning_rate": 0.0001879040822490693,
      "loss": 4.4296,
      "step": 1930
    },
    {
      "epoch": 0.4477655001298289,
      "grad_norm": 0.9958457946777344,
      "learning_rate": 0.00018679954756444723,
      "loss": 4.46,
      "step": 1940
    },
    {
      "epoch": 0.450073569718127,
      "grad_norm": 1.0039042234420776,
      "learning_rate": 0.00018569288562505183,
      "loss": 4.4473,
      "step": 1950
    },
    {
      "epoch": 0.4523816393064251,
      "grad_norm": 1.1141057014465332,
      "learning_rate": 0.00018458416040318857,
      "loss": 4.4023,
      "step": 1960
    },
    {
      "epoch": 0.45468970889472315,
      "grad_norm": 1.052263855934143,
      "learning_rate": 0.00018347343599043388,
      "loss": 4.4455,
      "step": 1970
    },
    {
      "epoch": 0.45699777848302126,
      "grad_norm": 0.942537784576416,
      "learning_rate": 0.00018236077659393077,
      "loss": 4.466,
      "step": 1980
    },
    {
      "epoch": 0.45930584807131936,
      "grad_norm": 1.0139743089675903,
      "learning_rate": 0.00018124624653267682,
      "loss": 4.4551,
      "step": 1990
    },
    {
      "epoch": 0.46161391765961746,
      "grad_norm": 1.0442676544189453,
      "learning_rate": 0.0001801299102338063,
      "loss": 4.4184,
      "step": 2000
    },
    {
      "epoch": 0.4639219872479155,
      "grad_norm": 1.1058298349380493,
      "learning_rate": 0.00017901183222886592,
      "loss": 4.4478,
      "step": 2010
    },
    {
      "epoch": 0.4662300568362136,
      "grad_norm": 1.1228169202804565,
      "learning_rate": 0.00017789207715008428,
      "loss": 4.3777,
      "step": 2020
    },
    {
      "epoch": 0.4685381264245117,
      "grad_norm": 1.022026777267456,
      "learning_rate": 0.0001767707097266359,
      "loss": 4.4119,
      "step": 2030
    },
    {
      "epoch": 0.4708461960128098,
      "grad_norm": 1.0868556499481201,
      "learning_rate": 0.0001756477947808994,
      "loss": 4.3989,
      "step": 2040
    },
    {
      "epoch": 0.47315426560110785,
      "grad_norm": 1.0714977979660034,
      "learning_rate": 0.00017452339722471026,
      "loss": 4.4166,
      "step": 2050
    },
    {
      "epoch": 0.47546233518940595,
      "grad_norm": 1.0716434717178345,
      "learning_rate": 0.0001733975820556086,
      "loss": 4.3757,
      "step": 2060
    },
    {
      "epoch": 0.47777040477770405,
      "grad_norm": 1.0657131671905518,
      "learning_rate": 0.00017227041435308177,
      "loss": 4.3756,
      "step": 2070
    },
    {
      "epoch": 0.48007847436600215,
      "grad_norm": 1.0671368837356567,
      "learning_rate": 0.00017114195927480256,
      "loss": 4.3956,
      "step": 2080
    },
    {
      "epoch": 0.4823865439543002,
      "grad_norm": 1.0677906274795532,
      "learning_rate": 0.00017001228205286236,
      "loss": 4.3989,
      "step": 2090
    },
    {
      "epoch": 0.4846946135425983,
      "grad_norm": 1.0088515281677246,
      "learning_rate": 0.00016888144799000047,
      "loss": 4.4024,
      "step": 2100
    },
    {
      "epoch": 0.4870026831308964,
      "grad_norm": 0.9815865755081177,
      "learning_rate": 0.0001677495224558293,
      "loss": 4.4096,
      "step": 2110
    },
    {
      "epoch": 0.4893107527191945,
      "grad_norm": 1.1238495111465454,
      "learning_rate": 0.00016661657088305526,
      "loss": 4.3879,
      "step": 2120
    },
    {
      "epoch": 0.4916188223074926,
      "grad_norm": 1.0731137990951538,
      "learning_rate": 0.0001654826587636967,
      "loss": 4.4087,
      "step": 2130
    },
    {
      "epoch": 0.49392689189579064,
      "grad_norm": 1.1006789207458496,
      "learning_rate": 0.0001643478516452977,
      "loss": 4.3862,
      "step": 2140
    },
    {
      "epoch": 0.49623496148408874,
      "grad_norm": 1.0866320133209229,
      "learning_rate": 0.00016321221512713928,
      "loss": 4.3835,
      "step": 2150
    },
    {
      "epoch": 0.49854303107238684,
      "grad_norm": 1.1201952695846558,
      "learning_rate": 0.00016207581485644707,
      "loss": 4.3263,
      "step": 2160
    },
    {
      "epoch": 0.500851100660685,
      "grad_norm": 1.0500922203063965,
      "learning_rate": 0.0001609387165245966,
      "loss": 4.3604,
      "step": 2170
    },
    {
      "epoch": 0.500851100660685,
      "eval_loss": 4.576458930969238,
      "eval_runtime": 39.2462,
      "eval_samples_per_second": 637.005,
      "eval_steps_per_second": 79.626,
      "step": 2170
    },
    {
      "epoch": 0.503159170248983,
      "grad_norm": 1.1081622838974,
      "learning_rate": 0.0001598009858633161,
      "loss": 4.3842,
      "step": 2180
    },
    {
      "epoch": 0.5054672398372811,
      "grad_norm": 1.0299861431121826,
      "learning_rate": 0.00015866268864088626,
      "loss": 4.435,
      "step": 2190
    },
    {
      "epoch": 0.5077753094255791,
      "grad_norm": 1.0571188926696777,
      "learning_rate": 0.00015752389065833898,
      "loss": 4.3528,
      "step": 2200
    },
    {
      "epoch": 0.5100833790138772,
      "grad_norm": 1.1348025798797607,
      "learning_rate": 0.0001563846577456533,
      "loss": 4.3108,
      "step": 2210
    },
    {
      "epoch": 0.5123914486021753,
      "grad_norm": 1.0860552787780762,
      "learning_rate": 0.00015524505575794997,
      "loss": 4.3618,
      "step": 2220
    },
    {
      "epoch": 0.5146995181904734,
      "grad_norm": 1.085950493812561,
      "learning_rate": 0.0001541051505716849,
      "loss": 4.3482,
      "step": 2230
    },
    {
      "epoch": 0.5170075877787715,
      "grad_norm": 1.0809770822525024,
      "learning_rate": 0.00015296500808084055,
      "loss": 4.3486,
      "step": 2240
    },
    {
      "epoch": 0.5193156573670696,
      "grad_norm": 1.1132404804229736,
      "learning_rate": 0.00015182469419311754,
      "loss": 4.311,
      "step": 2250
    },
    {
      "epoch": 0.5216237269553677,
      "grad_norm": 1.073387861251831,
      "learning_rate": 0.00015068427482612393,
      "loss": 4.3567,
      "step": 2260
    },
    {
      "epoch": 0.5239317965436658,
      "grad_norm": 1.0589022636413574,
      "learning_rate": 0.0001495438159035655,
      "loss": 4.3789,
      "step": 2270
    },
    {
      "epoch": 0.5262398661319638,
      "grad_norm": 1.082677960395813,
      "learning_rate": 0.00014840338335143452,
      "loss": 4.3612,
      "step": 2280
    },
    {
      "epoch": 0.5285479357202619,
      "grad_norm": 1.0973902940750122,
      "learning_rate": 0.0001472630430941987,
      "loss": 4.3735,
      "step": 2290
    },
    {
      "epoch": 0.53085600530856,
      "grad_norm": 1.1433314085006714,
      "learning_rate": 0.00014612286105099068,
      "loss": 4.3271,
      "step": 2300
    },
    {
      "epoch": 0.5331640748968581,
      "grad_norm": 1.0663039684295654,
      "learning_rate": 0.00014498290313179725,
      "loss": 4.3353,
      "step": 2310
    },
    {
      "epoch": 0.5354721444851562,
      "grad_norm": 1.1327035427093506,
      "learning_rate": 0.00014384323523364948,
      "loss": 4.3811,
      "step": 2320
    },
    {
      "epoch": 0.5377802140734543,
      "grad_norm": 1.102273941040039,
      "learning_rate": 0.00014270392323681303,
      "loss": 4.3814,
      "step": 2330
    },
    {
      "epoch": 0.5400882836617524,
      "grad_norm": 1.1237622499465942,
      "learning_rate": 0.00014156503300098038,
      "loss": 4.34,
      "step": 2340
    },
    {
      "epoch": 0.5423963532500505,
      "grad_norm": 1.1374157667160034,
      "learning_rate": 0.00014042663036146344,
      "loss": 4.3239,
      "step": 2350
    },
    {
      "epoch": 0.5447044228383485,
      "grad_norm": 1.171049952507019,
      "learning_rate": 0.0001392887811253878,
      "loss": 4.3335,
      "step": 2360
    },
    {
      "epoch": 0.5470124924266466,
      "grad_norm": 1.128770112991333,
      "learning_rate": 0.00013815155106788865,
      "loss": 4.3021,
      "step": 2370
    },
    {
      "epoch": 0.5493205620149447,
      "grad_norm": 1.1492712497711182,
      "learning_rate": 0.00013701500592830878,
      "loss": 4.3139,
      "step": 2380
    },
    {
      "epoch": 0.5516286316032428,
      "grad_norm": 1.1176902055740356,
      "learning_rate": 0.00013587921140639805,
      "loss": 4.3339,
      "step": 2390
    },
    {
      "epoch": 0.5539367011915409,
      "grad_norm": 1.106078863143921,
      "learning_rate": 0.00013474423315851586,
      "loss": 4.314,
      "step": 2400
    },
    {
      "epoch": 0.556244770779839,
      "grad_norm": 1.0579196214675903,
      "learning_rate": 0.00013361013679383553,
      "loss": 4.2973,
      "step": 2410
    },
    {
      "epoch": 0.5585528403681371,
      "grad_norm": 1.0957796573638916,
      "learning_rate": 0.0001324769878705518,
      "loss": 4.2925,
      "step": 2420
    },
    {
      "epoch": 0.5608609099564352,
      "grad_norm": 1.1303716897964478,
      "learning_rate": 0.000131344851892091,
      "loss": 4.338,
      "step": 2430
    },
    {
      "epoch": 0.5631689795447333,
      "grad_norm": 1.1496975421905518,
      "learning_rate": 0.0001302137943033249,
      "loss": 4.3075,
      "step": 2440
    },
    {
      "epoch": 0.5654770491330313,
      "grad_norm": 1.1256930828094482,
      "learning_rate": 0.00012908388048678686,
      "loss": 4.3234,
      "step": 2450
    },
    {
      "epoch": 0.5677851187213294,
      "grad_norm": 1.1006488800048828,
      "learning_rate": 0.00012795517575889303,
      "loss": 4.311,
      "step": 2460
    },
    {
      "epoch": 0.5700931883096275,
      "grad_norm": 1.223235845565796,
      "learning_rate": 0.00012682774536616623,
      "loss": 4.3056,
      "step": 2470
    },
    {
      "epoch": 0.5724012578979256,
      "grad_norm": 1.2225327491760254,
      "learning_rate": 0.00012570165448146447,
      "loss": 4.3276,
      "step": 2480
    },
    {
      "epoch": 0.5747093274862237,
      "grad_norm": 1.152992844581604,
      "learning_rate": 0.00012457696820021314,
      "loss": 4.3058,
      "step": 2490
    },
    {
      "epoch": 0.5770173970745218,
      "grad_norm": 1.252081036567688,
      "learning_rate": 0.00012345375153664264,
      "loss": 4.2789,
      "step": 2500
    },
    {
      "epoch": 0.5793254666628199,
      "grad_norm": 1.1459189653396606,
      "learning_rate": 0.0001223320694200297,
      "loss": 4.3181,
      "step": 2510
    },
    {
      "epoch": 0.581633536251118,
      "grad_norm": 1.0899156332015991,
      "learning_rate": 0.00012121198669094436,
      "loss": 4.3692,
      "step": 2520
    },
    {
      "epoch": 0.583941605839416,
      "grad_norm": 1.060966968536377,
      "learning_rate": 0.00012009356809750131,
      "loss": 4.3294,
      "step": 2530
    },
    {
      "epoch": 0.5862496754277141,
      "grad_norm": 1.1345113515853882,
      "learning_rate": 0.0001189768782916175,
      "loss": 4.3261,
      "step": 2540
    },
    {
      "epoch": 0.5885577450160122,
      "grad_norm": 1.1231807470321655,
      "learning_rate": 0.00011786198182527461,
      "loss": 4.3368,
      "step": 2550
    },
    {
      "epoch": 0.5908658146043103,
      "grad_norm": 1.0552847385406494,
      "learning_rate": 0.00011674894314678761,
      "loss": 4.2938,
      "step": 2560
    },
    {
      "epoch": 0.5931738841926084,
      "grad_norm": 1.1445595026016235,
      "learning_rate": 0.00011563782659707897,
      "loss": 4.3184,
      "step": 2570
    },
    {
      "epoch": 0.5954819537809065,
      "grad_norm": 1.1536098718643188,
      "learning_rate": 0.00011452869640595975,
      "loss": 4.3189,
      "step": 2580
    },
    {
      "epoch": 0.5977900233692046,
      "grad_norm": 1.2136541604995728,
      "learning_rate": 0.00011342161668841641,
      "loss": 4.2195,
      "step": 2590
    },
    {
      "epoch": 0.6000980929575027,
      "grad_norm": 1.1163119077682495,
      "learning_rate": 0.00011231665144090456,
      "loss": 4.2419,
      "step": 2600
    },
    {
      "epoch": 0.6010213207928219,
      "eval_loss": 4.489214897155762,
      "eval_runtime": 39.3697,
      "eval_samples_per_second": 635.006,
      "eval_steps_per_second": 79.376,
      "step": 2604
    },
    {
      "epoch": 0.6024061625458007,
      "grad_norm": 1.4087986946105957,
      "learning_rate": 0.0001112138645376496,
      "loss": 4.2601,
      "step": 2610
    },
    {
      "epoch": 0.6047142321340988,
      "grad_norm": 1.19622802734375,
      "learning_rate": 0.00011011331972695449,
      "loss": 4.296,
      "step": 2620
    },
    {
      "epoch": 0.6070223017223969,
      "grad_norm": 1.1068109273910522,
      "learning_rate": 0.00010901508062751438,
      "loss": 4.2879,
      "step": 2630
    },
    {
      "epoch": 0.609330371310695,
      "grad_norm": 1.156851887702942,
      "learning_rate": 0.00010791921072473941,
      "loss": 4.2653,
      "step": 2640
    },
    {
      "epoch": 0.6116384408989931,
      "grad_norm": 1.1451683044433594,
      "learning_rate": 0.00010682577336708449,
      "loss": 4.2987,
      "step": 2650
    },
    {
      "epoch": 0.6139465104872912,
      "grad_norm": 1.133888840675354,
      "learning_rate": 0.00010573483176238752,
      "loss": 4.2558,
      "step": 2660
    },
    {
      "epoch": 0.6162545800755893,
      "grad_norm": 1.1750303506851196,
      "learning_rate": 0.00010464644897421561,
      "loss": 4.3379,
      "step": 2670
    },
    {
      "epoch": 0.6185626496638874,
      "grad_norm": 1.1152632236480713,
      "learning_rate": 0.00010356068791821953,
      "loss": 4.2346,
      "step": 2680
    },
    {
      "epoch": 0.6208707192521854,
      "grad_norm": 1.167487382888794,
      "learning_rate": 0.0001024776113584966,
      "loss": 4.2805,
      "step": 2690
    },
    {
      "epoch": 0.6231787888404835,
      "grad_norm": 1.1430394649505615,
      "learning_rate": 0.00010139728190396288,
      "loss": 4.2433,
      "step": 2700
    },
    {
      "epoch": 0.6254868584287816,
      "grad_norm": 1.181851863861084,
      "learning_rate": 0.00010031976200473364,
      "loss": 4.2759,
      "step": 2710
    },
    {
      "epoch": 0.6277949280170797,
      "grad_norm": 1.152464509010315,
      "learning_rate": 9.92451139485136e-05,
      "loss": 4.2761,
      "step": 2720
    },
    {
      "epoch": 0.6301029976053778,
      "grad_norm": 1.1081931591033936,
      "learning_rate": 9.817339985699593e-05,
      "loss": 4.2457,
      "step": 2730
    },
    {
      "epoch": 0.6324110671936759,
      "grad_norm": 1.1744070053100586,
      "learning_rate": 9.710468168227158e-05,
      "loss": 4.2863,
      "step": 2740
    },
    {
      "epoch": 0.634719136781974,
      "grad_norm": 1.218344807624817,
      "learning_rate": 9.60390212032479e-05,
      "loss": 4.2711,
      "step": 2750
    },
    {
      "epoch": 0.6370272063702721,
      "grad_norm": 1.1617342233657837,
      "learning_rate": 9.497648002207745e-05,
      "loss": 4.2289,
      "step": 2760
    },
    {
      "epoch": 0.6393352759585702,
      "grad_norm": 1.1277827024459839,
      "learning_rate": 9.391711956059675e-05,
      "loss": 4.2894,
      "step": 2770
    },
    {
      "epoch": 0.6416433455468682,
      "grad_norm": 1.2645858526229858,
      "learning_rate": 9.286100105677608e-05,
      "loss": 4.2934,
      "step": 2780
    },
    {
      "epoch": 0.6439514151351663,
      "grad_norm": 1.1396396160125732,
      "learning_rate": 9.180818556117931e-05,
      "loss": 4.2627,
      "step": 2790
    },
    {
      "epoch": 0.6462594847234644,
      "grad_norm": 1.1112399101257324,
      "learning_rate": 9.075873393343487e-05,
      "loss": 4.2799,
      "step": 2800
    },
    {
      "epoch": 0.6485675543117625,
      "grad_norm": 1.1682491302490234,
      "learning_rate": 8.971270683871736e-05,
      "loss": 4.2557,
      "step": 2810
    },
    {
      "epoch": 0.6508756239000606,
      "grad_norm": 1.1381292343139648,
      "learning_rate": 8.867016474424121e-05,
      "loss": 4.2468,
      "step": 2820
    },
    {
      "epoch": 0.6531836934883587,
      "grad_norm": 1.2451766729354858,
      "learning_rate": 8.763116791576497e-05,
      "loss": 4.3402,
      "step": 2830
    },
    {
      "epoch": 0.6554917630766568,
      "grad_norm": 1.2092978954315186,
      "learning_rate": 8.659577641410756e-05,
      "loss": 4.2999,
      "step": 2840
    },
    {
      "epoch": 0.6577998326649549,
      "grad_norm": 1.1491693258285522,
      "learning_rate": 8.556405009167627e-05,
      "loss": 4.2427,
      "step": 2850
    },
    {
      "epoch": 0.6601079022532529,
      "grad_norm": 1.166319489479065,
      "learning_rate": 8.453604858900736e-05,
      "loss": 4.2599,
      "step": 2860
    },
    {
      "epoch": 0.662415971841551,
      "grad_norm": 1.1869049072265625,
      "learning_rate": 8.351183133131778e-05,
      "loss": 4.2849,
      "step": 2870
    },
    {
      "epoch": 0.6647240414298491,
      "grad_norm": 1.1337544918060303,
      "learning_rate": 8.24914575250707e-05,
      "loss": 4.2321,
      "step": 2880
    },
    {
      "epoch": 0.6670321110181472,
      "grad_norm": 1.1453369855880737,
      "learning_rate": 8.147498615455221e-05,
      "loss": 4.2508,
      "step": 2890
    },
    {
      "epoch": 0.6693401806064453,
      "grad_norm": 1.2037177085876465,
      "learning_rate": 8.046247597846244e-05,
      "loss": 4.2616,
      "step": 2900
    },
    {
      "epoch": 0.6716482501947434,
      "grad_norm": 1.2716485261917114,
      "learning_rate": 7.945398552651837e-05,
      "loss": 4.2711,
      "step": 2910
    },
    {
      "epoch": 0.6739563197830415,
      "grad_norm": 1.1850802898406982,
      "learning_rate": 7.844957309607061e-05,
      "loss": 4.254,
      "step": 2920
    },
    {
      "epoch": 0.6762643893713396,
      "grad_norm": 1.1652292013168335,
      "learning_rate": 7.744929674873344e-05,
      "loss": 4.2528,
      "step": 2930
    },
    {
      "epoch": 0.6785724589596376,
      "grad_norm": 1.1236425638198853,
      "learning_rate": 7.645321430702854e-05,
      "loss": 4.2309,
      "step": 2940
    },
    {
      "epoch": 0.6808805285479357,
      "grad_norm": 1.1567282676696777,
      "learning_rate": 7.546138335104229e-05,
      "loss": 4.2226,
      "step": 2950
    },
    {
      "epoch": 0.6831885981362338,
      "grad_norm": 1.2129831314086914,
      "learning_rate": 7.447386121509741e-05,
      "loss": 4.2682,
      "step": 2960
    },
    {
      "epoch": 0.6854966677245319,
      "grad_norm": 1.1564743518829346,
      "learning_rate": 7.349070498443857e-05,
      "loss": 4.2495,
      "step": 2970
    },
    {
      "epoch": 0.68780473731283,
      "grad_norm": 1.230202317237854,
      "learning_rate": 7.251197149193251e-05,
      "loss": 4.2339,
      "step": 2980
    },
    {
      "epoch": 0.6901128069011281,
      "grad_norm": 1.1715277433395386,
      "learning_rate": 7.153771731478289e-05,
      "loss": 4.2226,
      "step": 2990
    },
    {
      "epoch": 0.6924208764894262,
      "grad_norm": 1.2347677946090698,
      "learning_rate": 7.05679987712595e-05,
      "loss": 4.227,
      "step": 3000
    },
    {
      "epoch": 0.6947289460777243,
      "grad_norm": 1.19216787815094,
      "learning_rate": 6.96028719174428e-05,
      "loss": 4.2868,
      "step": 3010
    },
    {
      "epoch": 0.6970370156660224,
      "grad_norm": 1.2830464839935303,
      "learning_rate": 6.864239254398352e-05,
      "loss": 4.2326,
      "step": 3020
    },
    {
      "epoch": 0.6993450852543204,
      "grad_norm": 1.2899502515792847,
      "learning_rate": 6.76866161728778e-05,
      "loss": 4.2616,
      "step": 3030
    },
    {
      "epoch": 0.7011915409249588,
      "eval_loss": 4.435102462768555,
      "eval_runtime": 39.3746,
      "eval_samples_per_second": 634.927,
      "eval_steps_per_second": 79.366,
      "step": 3038
    },
    {
      "epoch": 0.7016531548426185,
      "grad_norm": 1.1735284328460693,
      "learning_rate": 6.67355980542571e-05,
      "loss": 4.2741,
      "step": 3040
    },
    {
      "epoch": 0.7039612244309166,
      "grad_norm": 1.2111891508102417,
      "learning_rate": 6.578939316319502e-05,
      "loss": 4.2271,
      "step": 3050
    },
    {
      "epoch": 0.7062692940192147,
      "grad_norm": 1.188081979751587,
      "learning_rate": 6.484805619652893e-05,
      "loss": 4.2188,
      "step": 3060
    },
    {
      "epoch": 0.7085773636075128,
      "grad_norm": 1.285130500793457,
      "learning_rate": 6.391164156969856e-05,
      "loss": 4.193,
      "step": 3070
    },
    {
      "epoch": 0.7108854331958109,
      "grad_norm": 1.1954678297042847,
      "learning_rate": 6.298020341359972e-05,
      "loss": 4.211,
      "step": 3080
    },
    {
      "epoch": 0.713193502784109,
      "grad_norm": 1.1639289855957031,
      "learning_rate": 6.205379557145607e-05,
      "loss": 4.2421,
      "step": 3090
    },
    {
      "epoch": 0.7155015723724071,
      "grad_norm": 1.1945711374282837,
      "learning_rate": 6.113247159570591e-05,
      "loss": 4.2843,
      "step": 3100
    },
    {
      "epoch": 0.7178096419607051,
      "grad_norm": 1.2462584972381592,
      "learning_rate": 6.0216284744907036e-05,
      "loss": 4.2239,
      "step": 3110
    },
    {
      "epoch": 0.7201177115490032,
      "grad_norm": 1.2085964679718018,
      "learning_rate": 5.930528798065741e-05,
      "loss": 4.2027,
      "step": 3120
    },
    {
      "epoch": 0.7224257811373013,
      "grad_norm": 1.1740282773971558,
      "learning_rate": 5.839953396453442e-05,
      "loss": 4.2056,
      "step": 3130
    },
    {
      "epoch": 0.7247338507255994,
      "grad_norm": 1.1846504211425781,
      "learning_rate": 5.749907505504999e-05,
      "loss": 4.2292,
      "step": 3140
    },
    {
      "epoch": 0.7270419203138975,
      "grad_norm": 1.3088330030441284,
      "learning_rate": 5.660396330462448e-05,
      "loss": 4.2503,
      "step": 3150
    },
    {
      "epoch": 0.7293499899021956,
      "grad_norm": 1.2662092447280884,
      "learning_rate": 5.571425045657711e-05,
      "loss": 4.2536,
      "step": 3160
    },
    {
      "epoch": 0.7316580594904937,
      "grad_norm": 1.1370505094528198,
      "learning_rate": 5.4829987942135495e-05,
      "loss": 4.1551,
      "step": 3170
    },
    {
      "epoch": 0.7339661290787918,
      "grad_norm": 1.244957447052002,
      "learning_rate": 5.395122687746217e-05,
      "loss": 4.2295,
      "step": 3180
    },
    {
      "epoch": 0.7362741986670898,
      "grad_norm": 1.2070027589797974,
      "learning_rate": 5.3078018060699836e-05,
      "loss": 4.2056,
      "step": 3190
    },
    {
      "epoch": 0.7385822682553879,
      "grad_norm": 1.2402708530426025,
      "learning_rate": 5.221041196903489e-05,
      "loss": 4.243,
      "step": 3200
    },
    {
      "epoch": 0.740890337843686,
      "grad_norm": 1.2046093940734863,
      "learning_rate": 5.1348458755779706e-05,
      "loss": 4.2083,
      "step": 3210
    },
    {
      "epoch": 0.7431984074319841,
      "grad_norm": 1.2073742151260376,
      "learning_rate": 5.049220824747306e-05,
      "loss": 4.2024,
      "step": 3220
    },
    {
      "epoch": 0.7455064770202822,
      "grad_norm": 1.1986374855041504,
      "learning_rate": 4.964170994100019e-05,
      "loss": 4.2975,
      "step": 3230
    },
    {
      "epoch": 0.7478145466085803,
      "grad_norm": 1.2024635076522827,
      "learning_rate": 4.879701300073134e-05,
      "loss": 4.2499,
      "step": 3240
    },
    {
      "epoch": 0.7501226161968784,
      "grad_norm": 1.20681893825531,
      "learning_rate": 4.7958166255679787e-05,
      "loss": 4.2109,
      "step": 3250
    },
    {
      "epoch": 0.7524306857851765,
      "grad_norm": 1.2092540264129639,
      "learning_rate": 4.712521819667936e-05,
      "loss": 4.2221,
      "step": 3260
    },
    {
      "epoch": 0.7547387553734745,
      "grad_norm": 1.1802047491073608,
      "learning_rate": 4.629821697358108e-05,
      "loss": 4.202,
      "step": 3270
    },
    {
      "epoch": 0.7570468249617726,
      "grad_norm": 1.237534999847412,
      "learning_rate": 4.5477210392469944e-05,
      "loss": 4.2039,
      "step": 3280
    },
    {
      "epoch": 0.7593548945500707,
      "grad_norm": 1.212430715560913,
      "learning_rate": 4.4662245912901364e-05,
      "loss": 4.2043,
      "step": 3290
    },
    {
      "epoch": 0.7616629641383688,
| "grad_norm": 1.2303744554519653, | |
| "learning_rate": 4.38533706451579e-05, | |
| "loss": 4.2249, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.7639710337266669, | |
| "grad_norm": 1.2151821851730347, | |
| "learning_rate": 4.305063134752559e-05, | |
| "loss": 4.2416, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.766279103314965, | |
| "grad_norm": 1.1624999046325684, | |
| "learning_rate": 4.225407442359134e-05, | |
| "loss": 4.248, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.7685871729032631, | |
| "grad_norm": 1.1886614561080933, | |
| "learning_rate": 4.1463745919560296e-05, | |
| "loss": 4.1549, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.7708952424915612, | |
| "grad_norm": 1.141176700592041, | |
| "learning_rate": 4.067969152159433e-05, | |
| "loss": 4.1967, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.7732033120798593, | |
| "grad_norm": 1.1527976989746094, | |
| "learning_rate": 3.9901956553170714e-05, | |
| "loss": 4.193, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.7755113816681573, | |
| "grad_norm": 1.2025054693222046, | |
| "learning_rate": 3.913058597246242e-05, | |
| "loss": 4.1946, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.7778194512564554, | |
| "grad_norm": 1.13848078250885, | |
| "learning_rate": 3.836562436973906e-05, | |
| "loss": 4.1719, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.7801275208447535, | |
| "grad_norm": 1.2138718366622925, | |
| "learning_rate": 3.7607115964789537e-05, | |
| "loss": 4.2069, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.7824355904330516, | |
| "grad_norm": 1.1834176778793335, | |
| "learning_rate": 3.6855104604365485e-05, | |
| "loss": 4.2246, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.7847436600213497, | |
| "grad_norm": 1.157386064529419, | |
| "learning_rate": 3.610963375964694e-05, | |
| "loss": 4.2147, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.7870517296096478, | |
| "grad_norm": 1.251615285873413, | |
| "learning_rate": 3.5370746523729215e-05, | |
| "loss": 4.2354, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.7893597991979459, | |
| "grad_norm": 1.279531478881836, | |
| "learning_rate": 3.463848560913199e-05, | |
| "loss": 4.2083, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.791667868786244, | |
| "grad_norm": 1.2485535144805908, | |
| "learning_rate": 3.391289334533026e-05, | |
| "loss": 4.1657, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.793975938374542, | |
| "grad_norm": 1.2322598695755005, | |
| "learning_rate": 3.3194011676307234e-05, | |
| "loss": 4.1474, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.79628400796284, | |
| "grad_norm": 1.2795343399047852, | |
| "learning_rate": 3.248188215812985e-05, | |
| "loss": 4.1557, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.7985920775511381, | |
| "grad_norm": 1.236671805381775, | |
| "learning_rate": 3.1776545956546473e-05, | |
| "loss": 4.1628, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.8009001471394362, | |
| "grad_norm": 1.163258671760559, | |
| "learning_rate": 3.107804384460745e-05, | |
| "loss": 4.2085, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.8013617610570959, | |
| "eval_loss": 4.39634370803833, | |
| "eval_runtime": 39.8062, | |
| "eval_samples_per_second": 628.043, | |
| "eval_steps_per_second": 78.505, | |
| "step": 3472 | |
| }, | |
| { | |
| "epoch": 0.8032082167277343, | |
| "grad_norm": 1.2136709690093994, | |
| "learning_rate": 3.0386416200307772e-05, | |
| "loss": 4.2476, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.8055162863160324, | |
| "grad_norm": 1.214560866355896, | |
| "learning_rate": 2.970170300425341e-05, | |
| "loss": 4.1994, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.8078243559043305, | |
| "grad_norm": 1.2116317749023438, | |
| "learning_rate": 2.9023943837349795e-05, | |
| "loss": 4.1864, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8101324254926286, | |
| "grad_norm": 1.2190097570419312, | |
| "learning_rate": 2.835317787851411e-05, | |
| "loss": 4.2019, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.8124404950809266, | |
| "grad_norm": 1.2359529733657837, | |
| "learning_rate": 2.768944390241012e-05, | |
| "loss": 4.1716, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.8147485646692247, | |
| "grad_norm": 1.255321979522705, | |
| "learning_rate": 2.703278027720713e-05, | |
| "loss": 4.1866, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.8170566342575228, | |
| "grad_norm": 1.2041914463043213, | |
| "learning_rate": 2.6383224962361766e-05, | |
| "loss": 4.2161, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.8193647038458209, | |
| "grad_norm": 1.2864853143692017, | |
| "learning_rate": 2.5740815506423917e-05, | |
| "loss": 4.1654, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.821672773434119, | |
| "grad_norm": 1.3623309135437012, | |
| "learning_rate": 2.51055890448658e-05, | |
| "loss": 4.2003, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.8239808430224171, | |
| "grad_norm": 1.291591763496399, | |
| "learning_rate": 2.44775822979358e-05, | |
| "loss": 4.159, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.8262889126107152, | |
| "grad_norm": 1.2855194807052612, | |
| "learning_rate": 2.3856831568535307e-05, | |
| "loss": 4.1886, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.8285969821990133, | |
| "grad_norm": 1.2652373313903809, | |
| "learning_rate": 2.324337274012061e-05, | |
| "loss": 4.1722, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.8309050517873113, | |
| "grad_norm": 1.2906244993209839, | |
| "learning_rate": 2.2637241274628108e-05, | |
| "loss": 4.1888, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.8332131213756094, | |
| "grad_norm": 1.2488876581192017, | |
| "learning_rate": 2.2038472210424952e-05, | |
| "loss": 4.2159, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.8355211909639075, | |
| "grad_norm": 1.2642358541488647, | |
| "learning_rate": 2.1447100160283082e-05, | |
| "loss": 4.1982, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.8378292605522056, | |
| "grad_norm": 1.2974900007247925, | |
| "learning_rate": 2.0863159309378657e-05, | |
| "loss": 4.2046, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.8401373301405037, | |
| "grad_norm": 1.2382539510726929, | |
| "learning_rate": 2.0286683413315873e-05, | |
| "loss": 4.1495, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.8424453997288018, | |
| "grad_norm": 1.25460946559906, | |
| "learning_rate": 1.9717705796175727e-05, | |
| "loss": 4.2023, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.8447534693170999, | |
| "grad_norm": 1.22829270362854, | |
| "learning_rate": 1.9156259348589514e-05, | |
| "loss": 4.1346, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.847061538905398, | |
| "grad_norm": 1.2403064966201782, | |
| "learning_rate": 1.8602376525837655e-05, | |
| "loss": 4.1988, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.8493696084936961, | |
| "grad_norm": 1.2172107696533203, | |
| "learning_rate": 1.8056089345973536e-05, | |
| "loss": 4.2222, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.8516776780819941, | |
| "grad_norm": 1.2095916271209717, | |
| "learning_rate": 1.7517429387972608e-05, | |
| "loss": 4.1647, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.8539857476702922, | |
| "grad_norm": 1.2624644041061401, | |
| "learning_rate": 1.6986427789907115e-05, | |
| "loss": 4.2337, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.8562938172585903, | |
| "grad_norm": 1.2449021339416504, | |
| "learning_rate": 1.6463115247145782e-05, | |
| "loss": 4.1926, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.8586018868468884, | |
| "grad_norm": 1.2478352785110474, | |
| "learning_rate": 1.594752201057968e-05, | |
| "loss": 4.1702, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.8609099564351865, | |
| "grad_norm": 1.263993740081787, | |
| "learning_rate": 1.5439677884873424e-05, | |
| "loss": 4.1817, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.8632180260234846, | |
| "grad_norm": 1.1771023273468018, | |
| "learning_rate": 1.4939612226742347e-05, | |
| "loss": 4.1884, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.8655260956117827, | |
| "grad_norm": 1.2805135250091553, | |
| "learning_rate": 1.4447353943255341e-05, | |
| "loss": 4.1973, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.8678341652000808, | |
| "grad_norm": 1.2390002012252808, | |
| "learning_rate": 1.3962931490163992e-05, | |
| "loss": 4.163, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.8701422347883788, | |
| "grad_norm": 1.1928465366363525, | |
| "learning_rate": 1.3486372870257539e-05, | |
| "loss": 4.2661, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.8724503043766769, | |
| "grad_norm": 1.227087378501892, | |
| "learning_rate": 1.3017705631744263e-05, | |
| "loss": 4.1941, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.874758373964975, | |
| "grad_norm": 1.2422763109207153, | |
| "learning_rate": 1.255695686665883e-05, | |
| "loss": 4.1729, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.8770664435532731, | |
| "grad_norm": 1.2221300601959229, | |
| "learning_rate": 1.2104153209296374e-05, | |
| "loss": 4.1766, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.8793745131415712, | |
| "grad_norm": 1.2882457971572876, | |
| "learning_rate": 1.1659320834672753e-05, | |
| "loss": 4.2104, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.8816825827298693, | |
| "grad_norm": 1.3336913585662842, | |
| "learning_rate": 1.1222485457011516e-05, | |
| "loss": 4.2294, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.8839906523181674, | |
| "grad_norm": 1.1365299224853516, | |
| "learning_rate": 1.079367232825743e-05, | |
| "loss": 4.1763, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.8862987219064655, | |
| "grad_norm": 1.3608059883117676, | |
| "learning_rate": 1.0372906236616734e-05, | |
| "loss": 4.2236, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.8886067914947635, | |
| "grad_norm": 1.3224635124206543, | |
| "learning_rate": 9.960211505124215e-06, | |
| "loss": 4.1455, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.8909148610830616, | |
| "grad_norm": 1.2558730840682983, | |
| "learning_rate": 9.555611990237317e-06, | |
| "loss": 4.1669, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.8932229306713597, | |
| "grad_norm": 1.2785941362380981, | |
| "learning_rate": 9.159131080456839e-06, | |
| "loss": 4.1952, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.8955310002596578, | |
| "grad_norm": 1.2829550504684448, | |
| "learning_rate": 8.770791694975093e-06, | |
| "loss": 4.1654, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.8978390698479559, | |
| "grad_norm": 1.3000894784927368, | |
| "learning_rate": 8.390616282350992e-06, | |
| "loss": 4.223, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.900147139436254, | |
| "grad_norm": 1.4616061449050903, | |
| "learning_rate": 8.018626819212354e-06, | |
| "loss": 4.1981, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.9015319811892328, | |
| "eval_loss": 4.381021022796631, | |
| "eval_runtime": 39.549, | |
| "eval_samples_per_second": 632.127, | |
| "eval_steps_per_second": 79.016, | |
| "step": 3906 | |
| }, | |
| { | |
| "epoch": 0.9024552090245521, | |
| "grad_norm": 1.295082449913025, | |
| "learning_rate": 7.654844808985617e-06, | |
| "loss": 4.1607, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.9047632786128502, | |
| "grad_norm": 1.3329360485076904, | |
| "learning_rate": 7.299291280652503e-06, | |
| "loss": 4.1855, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.9070713482011483, | |
| "grad_norm": 1.2815297842025757, | |
| "learning_rate": 6.951986787534824e-06, | |
| "loss": 4.2036, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.9093794177894463, | |
| "grad_norm": 1.2049307823181152, | |
| "learning_rate": 6.612951406106015e-06, | |
| "loss": 4.1678, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.9116874873777444, | |
| "grad_norm": 1.3223621845245361, | |
| "learning_rate": 6.282204734830826e-06, | |
| "loss": 4.1758, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.9139955569660425, | |
| "grad_norm": 1.3526756763458252, | |
| "learning_rate": 5.959765893032131e-06, | |
| "loss": 4.1829, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.9163036265543406, | |
| "grad_norm": 1.3002768754959106, | |
| "learning_rate": 5.645653519786064e-06, | |
| "loss": 4.1908, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.9186116961426387, | |
| "grad_norm": 1.294180989265442, | |
| "learning_rate": 5.339885772844227e-06, | |
| "loss": 4.1767, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.9209197657309368, | |
| "grad_norm": 1.2366904020309448, | |
| "learning_rate": 5.042480327584231e-06, | |
| "loss": 4.1572, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.9232278353192349, | |
| "grad_norm": 1.2950676679611206, | |
| "learning_rate": 4.753454375987898e-06, | |
| "loss": 4.1748, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.925535904907533, | |
| "grad_norm": 1.3021730184555054, | |
| "learning_rate": 4.472824625647503e-06, | |
| "loss": 4.1417, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.927843974495831, | |
| "grad_norm": 1.3200892210006714, | |
| "learning_rate": 4.2006072987998355e-06, | |
| "loss": 4.1742, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.9301520440841291, | |
| "grad_norm": 1.2840920686721802, | |
| "learning_rate": 3.9368181313886085e-06, | |
| "loss": 4.1808, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.9324601136724272, | |
| "grad_norm": 1.2291038036346436, | |
| "learning_rate": 3.681472372154659e-06, | |
| "loss": 4.2205, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.9347681832607253, | |
| "grad_norm": 1.2467334270477295, | |
| "learning_rate": 3.434584781754668e-06, | |
| "loss": 4.2397, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.9370762528490234, | |
| "grad_norm": 1.292160153388977, | |
| "learning_rate": 3.196169631907658e-06, | |
| "loss": 4.1912, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.9393843224373215, | |
| "grad_norm": 1.3593335151672363, | |
| "learning_rate": 2.966240704570205e-06, | |
| "loss": 4.1743, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.9416923920256196, | |
| "grad_norm": 1.2135837078094482, | |
| "learning_rate": 2.7448112911396257e-06, | |
| "loss": 4.1725, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.9440004616139177, | |
| "grad_norm": 1.2826229333877563, | |
| "learning_rate": 2.5318941916857236e-06, | |
| "loss": 4.1779, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.9463085312022157, | |
| "grad_norm": 1.3116379976272583, | |
| "learning_rate": 2.327501714210783e-06, | |
| "loss": 4.1753, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.9486166007905138, | |
| "grad_norm": 1.2221295833587646, | |
| "learning_rate": 2.1316456739381373e-06, | |
| "loss": 4.1947, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.9509246703788119, | |
| "grad_norm": 1.2066534757614136, | |
| "learning_rate": 1.9443373926291806e-06, | |
| "loss": 4.1914, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.95323273996711, | |
| "grad_norm": 1.2396924495697021, | |
| "learning_rate": 1.765587697928844e-06, | |
| "loss": 4.1717, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.9555408095554081, | |
| "grad_norm": 1.2560229301452637, | |
| "learning_rate": 1.5954069227397782e-06, | |
| "loss": 4.1566, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.9578488791437062, | |
| "grad_norm": 1.4303022623062134, | |
| "learning_rate": 1.43380490462493e-06, | |
| "loss": 4.1762, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.9601569487320043, | |
| "grad_norm": 1.2595306634902954, | |
| "learning_rate": 1.2807909852389498e-06, | |
| "loss": 4.1753, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.9624650183203024, | |
| "grad_norm": 1.1923511028289795, | |
| "learning_rate": 1.1363740097881547e-06, | |
| "loss": 4.1417, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.9647730879086004, | |
| "grad_norm": 1.1929636001586914, | |
| "learning_rate": 1.0005623265192053e-06, | |
| "loss": 4.1698, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.9670811574968985, | |
| "grad_norm": 1.2409882545471191, | |
| "learning_rate": 8.733637862365251e-07, | |
| "loss": 4.1773, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.9693892270851966, | |
| "grad_norm": 1.2641384601593018, | |
| "learning_rate": 7.547857418485459e-07, | |
| "loss": 4.1271, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.9716972966734947, | |
| "grad_norm": 1.1365667581558228, | |
| "learning_rate": 6.448350479425157e-07, | |
| "loss": 4.14, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.9740053662617928, | |
| "grad_norm": 1.2669204473495483, | |
| "learning_rate": 5.435180603884148e-07, | |
| "loss": 4.1989, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.9763134358500909, | |
| "grad_norm": 1.2706636190414429, | |
| "learning_rate": 4.508406359714001e-07, | |
| "loss": 4.1727, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.978621505438389, | |
| "grad_norm": 1.293450951576233, | |
| "learning_rate": 3.6680813205339223e-07, | |
| "loss": 4.1872, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.9809295750266871, | |
| "grad_norm": 1.2493462562561035, | |
| "learning_rate": 2.9142540626325707e-07, | |
| "loss": 4.1937, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.9832376446149852, | |
| "grad_norm": 1.265554428100586, | |
| "learning_rate": 2.246968162160634e-07, | |
| "loss": 4.1332, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.9855457142032832, | |
| "grad_norm": 1.2552322149276733, | |
| "learning_rate": 1.6662621926118446e-07, | |
| "loss": 4.221, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.9878537837915813, | |
| "grad_norm": 1.2385802268981934, | |
| "learning_rate": 1.1721697225932636e-07, | |
| "loss": 4.1898, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.9901618533798794, | |
| "grad_norm": 1.358494520187378, | |
| "learning_rate": 7.647193138843322e-08, | |
| "loss": 4.1357, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.9924699229681775, | |
| "grad_norm": 1.174631953239441, | |
| "learning_rate": 4.439345197861932e-08, | |
| "loss": 4.1818, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.9947779925564756, | |
| "grad_norm": 1.249579668045044, | |
| "learning_rate": 2.0983388376011324e-08, | |
| "loss": 4.1376, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.9970860621447737, | |
| "grad_norm": 1.2420294284820557, | |
| "learning_rate": 6.243093835567314e-09, | |
| "loss": 4.1601, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.9993941317330718, | |
| "grad_norm": 1.2425047159194946, | |
| "learning_rate": 1.734204427727981e-10, | |
| "loss": 4.1714, | |
| "step": 4330 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4332, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 434, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2436421205975040.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
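
The file above is a Hugging Face `transformers` trainer state: `log_history` interleaves training records (keyed by `loss`, with `grad_norm` and `learning_rate`) and evaluation records (keyed by `eval_loss`, with runtime and throughput fields), and logging stops at the last multiple of `logging_steps` before `max_steps` (4330 of 4332). Below is a minimal sketch for summarizing such a state file offline, assuming it has been saved locally under the hypothetical path `trainer_state.json` (the path is an illustrative assumption, not taken from the log); it relies only on the standard-library `json` module.

```python
import json

# Minimal sketch: load a saved trainer state and summarize it.
# Assumption: the JSON document above has been written to the
# hypothetical local path "trainer_state.json".
with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# log_history interleaves two record shapes: training steps carry
# "loss", evaluation steps carry "eval_loss"; split by key presence.
train_log = [rec for rec in state["log_history"] if "loss" in rec]
eval_log = [rec for rec in state["log_history"] if "eval_loss" in rec]

last = train_log[-1]
best = min(eval_log, key=lambda rec: rec["eval_loss"])

print(f"last logged training step: {last['step']} / {state['max_steps']}")
print(f"final train loss: {last['loss']:.4f} (lr {last['learning_rate']:.3e})")
print(f"best eval loss:   {best['eval_loss']:.6f} at step {best['step']}")
```

Run against the state above, this should report a last logged step of 4330/4332, a final training loss of 4.1714, and a best evaluation loss of 4.381021 at step 3906, the checkpoint the trainer retained as best.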