{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 42276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002838489923360772, "grad_norm": 5.731553554534912, "learning_rate": 3.8000000000000005e-06, "loss": 2.7135, "step": 20 }, { "epoch": 0.005676979846721544, "grad_norm": 5.142412185668945, "learning_rate": 7.800000000000002e-06, "loss": 2.6255, "step": 40 }, { "epoch": 0.008515469770082317, "grad_norm": 4.703895092010498, "learning_rate": 1.18e-05, "loss": 2.6593, "step": 60 }, { "epoch": 0.011353959693443088, "grad_norm": 4.917039394378662, "learning_rate": 1.58e-05, "loss": 2.5456, "step": 80 }, { "epoch": 0.01419244961680386, "grad_norm": 5.939738750457764, "learning_rate": 1.98e-05, "loss": 2.5046, "step": 100 }, { "epoch": 0.017030939540164634, "grad_norm": 4.615878582000732, "learning_rate": 1.9999989985111882e-05, "loss": 2.5399, "step": 120 }, { "epoch": 0.019869429463525403, "grad_norm": 4.4576215744018555, "learning_rate": 1.999995780433058e-05, "loss": 2.5118, "step": 140 }, { "epoch": 0.022707919386886176, "grad_norm": 4.865583896636963, "learning_rate": 1.9999903429985447e-05, "loss": 2.5063, "step": 160 }, { "epoch": 0.02554640931024695, "grad_norm": 4.703500270843506, "learning_rate": 1.9999826862197157e-05, "loss": 2.5294, "step": 180 }, { "epoch": 0.02838489923360772, "grad_norm": 4.76619291305542, "learning_rate": 1.9999728101135637e-05, "loss": 2.5099, "step": 200 }, { "epoch": 0.031223389156968494, "grad_norm": 4.474153518676758, "learning_rate": 1.9999607147020074e-05, "loss": 2.5004, "step": 220 }, { "epoch": 0.03406187908032927, "grad_norm": 4.568985939025879, "learning_rate": 1.999946400011892e-05, "loss": 2.4972, "step": 240 }, { "epoch": 0.03690036900369004, "grad_norm": 4.54977560043335, "learning_rate": 1.9999298660749854e-05, "loss": 2.4664, "step": 260 }, { "epoch": 0.039738858927050806, "grad_norm": 4.343109130859375, "learning_rate": 1.9999111129279837e-05, "loss": 2.4562, "step": 280 }, { "epoch": 0.04257734885041158, "grad_norm": 4.553160190582275, "learning_rate": 1.999890140612507e-05, "loss": 2.4779, "step": 300 }, { "epoch": 0.04541583877377235, "grad_norm": 4.5021891593933105, "learning_rate": 1.9998669491750996e-05, "loss": 2.4641, "step": 320 }, { "epoch": 0.04825432869713313, "grad_norm": 3.759829044342041, "learning_rate": 1.999841538667233e-05, "loss": 2.4064, "step": 340 }, { "epoch": 0.0510928186204939, "grad_norm": 4.030900001525879, "learning_rate": 1.999813909145301e-05, "loss": 2.4956, "step": 360 }, { "epoch": 0.05393130854385467, "grad_norm": 3.918046474456787, "learning_rate": 1.9997840606706247e-05, "loss": 2.4357, "step": 380 }, { "epoch": 0.05676979846721544, "grad_norm": 4.04481840133667, "learning_rate": 1.9997519933094485e-05, "loss": 2.4628, "step": 400 }, { "epoch": 0.05960828839057621, "grad_norm": 4.3381123542785645, "learning_rate": 1.9997177071329413e-05, "loss": 2.4263, "step": 420 }, { "epoch": 0.06244677831393699, "grad_norm": 4.07150411605835, "learning_rate": 1.9996812022171973e-05, "loss": 2.4416, "step": 440 }, { "epoch": 0.06528526823729776, "grad_norm": 3.892810821533203, "learning_rate": 1.9996424786432337e-05, "loss": 2.4994, "step": 460 }, { "epoch": 0.06812375816065853, "grad_norm": 3.7170569896698, "learning_rate": 1.9996015364969926e-05, "loss": 2.3641, "step": 480 }, { "epoch": 0.0709622480840193, "grad_norm": 4.001615047454834, "learning_rate": 1.9995583758693393e-05, "loss": 2.3916, "step": 500 }, { "epoch": 0.07380073800738007, "grad_norm": 3.5111935138702393, "learning_rate": 1.999512996856063e-05, "loss": 2.3917, "step": 520 }, { "epoch": 0.07663922793074085, "grad_norm": 3.6010093688964844, "learning_rate": 1.999465399557877e-05, "loss": 2.3832, "step": 540 }, { "epoch": 0.07947771785410161, "grad_norm": 3.170520305633545, "learning_rate": 1.999415584080416e-05, "loss": 2.3565, "step": 560 }, { "epoch": 0.08231620777746239, "grad_norm": 3.6485605239868164, "learning_rate": 1.9993635505342398e-05, "loss": 2.4518, "step": 580 }, { "epoch": 0.08515469770082316, "grad_norm": 4.3825883865356445, "learning_rate": 1.9993092990348294e-05, "loss": 2.3855, "step": 600 }, { "epoch": 0.08799318762418393, "grad_norm": 3.575106620788574, "learning_rate": 1.9992528297025888e-05, "loss": 2.432, "step": 620 }, { "epoch": 0.0908316775475447, "grad_norm": 3.671234130859375, "learning_rate": 1.9991941426628442e-05, "loss": 2.4236, "step": 640 }, { "epoch": 0.09367016747090548, "grad_norm": 3.577385902404785, "learning_rate": 1.999133238045843e-05, "loss": 2.3968, "step": 660 }, { "epoch": 0.09650865739426626, "grad_norm": 3.3824257850646973, "learning_rate": 1.9990701159867556e-05, "loss": 2.3547, "step": 680 }, { "epoch": 0.09934714731762702, "grad_norm": 3.61997127532959, "learning_rate": 1.9990047766256726e-05, "loss": 2.371, "step": 700 }, { "epoch": 0.1021856372409878, "grad_norm": 3.4755024909973145, "learning_rate": 1.998937220107606e-05, "loss": 2.3458, "step": 720 }, { "epoch": 0.10502412716434857, "grad_norm": 3.4415810108184814, "learning_rate": 1.998867446582488e-05, "loss": 2.3694, "step": 740 }, { "epoch": 0.10786261708770933, "grad_norm": 3.7888681888580322, "learning_rate": 1.9987954562051724e-05, "loss": 2.3251, "step": 760 }, { "epoch": 0.11070110701107011, "grad_norm": 3.5540387630462646, "learning_rate": 1.9987212491354316e-05, "loss": 2.3902, "step": 780 }, { "epoch": 0.11353959693443089, "grad_norm": 3.558457851409912, "learning_rate": 1.9986448255379587e-05, "loss": 2.4252, "step": 800 }, { "epoch": 0.11637808685779165, "grad_norm": 3.5049099922180176, "learning_rate": 1.998566185582365e-05, "loss": 2.3848, "step": 820 }, { "epoch": 0.11921657678115243, "grad_norm": 3.502272367477417, "learning_rate": 1.9984853294431814e-05, "loss": 2.3255, "step": 840 }, { "epoch": 0.1220550667045132, "grad_norm": 3.4040749073028564, "learning_rate": 1.998402257299858e-05, "loss": 2.3954, "step": 860 }, { "epoch": 0.12489355662787398, "grad_norm": 3.5705454349517822, "learning_rate": 1.9983169693367612e-05, "loss": 2.3745, "step": 880 }, { "epoch": 0.12773204655123474, "grad_norm": 3.2983694076538086, "learning_rate": 1.9982294657431768e-05, "loss": 2.3696, "step": 900 }, { "epoch": 0.13057053647459552, "grad_norm": 3.2836155891418457, "learning_rate": 1.9981397467133073e-05, "loss": 2.3636, "step": 920 }, { "epoch": 0.1334090263979563, "grad_norm": 3.4141528606414795, "learning_rate": 1.9980478124462722e-05, "loss": 2.3527, "step": 940 }, { "epoch": 0.13624751632131707, "grad_norm": 3.690831422805786, "learning_rate": 1.9979536631461066e-05, "loss": 2.3042, "step": 960 }, { "epoch": 0.13908600624467782, "grad_norm": 3.2653017044067383, "learning_rate": 1.9978572990217628e-05, "loss": 2.3181, "step": 980 }, { "epoch": 0.1419244961680386, "grad_norm": 3.183215379714966, "learning_rate": 1.9977587202871077e-05, "loss": 2.3809, "step": 1000 }, { "epoch": 0.14476298609139937, "grad_norm": 3.396559238433838, "learning_rate": 1.997657927160924e-05, "loss": 2.3508, "step": 1020 }, { "epoch": 0.14760147601476015, "grad_norm": 3.143009662628174, "learning_rate": 1.997554919866908e-05, "loss": 2.2862, "step": 1040 }, { "epoch": 0.15043996593812092, "grad_norm": 3.9524123668670654, "learning_rate": 1.997449698633671e-05, "loss": 2.3166, "step": 1060 }, { "epoch": 0.1532784558614817, "grad_norm": 3.6695706844329834, "learning_rate": 1.997342263694737e-05, "loss": 2.3583, "step": 1080 }, { "epoch": 0.15611694578484248, "grad_norm": 3.47139048576355, "learning_rate": 1.997232615288544e-05, "loss": 2.2956, "step": 1100 }, { "epoch": 0.15895543570820322, "grad_norm": 4.078399658203125, "learning_rate": 1.997120753658441e-05, "loss": 2.3396, "step": 1120 }, { "epoch": 0.161793925631564, "grad_norm": 3.2317163944244385, "learning_rate": 1.9970066790526906e-05, "loss": 2.2451, "step": 1140 }, { "epoch": 0.16463241555492478, "grad_norm": 3.0501227378845215, "learning_rate": 1.996890391724466e-05, "loss": 2.2886, "step": 1160 }, { "epoch": 0.16747090547828555, "grad_norm": 3.0787248611450195, "learning_rate": 1.996771891931851e-05, "loss": 2.3157, "step": 1180 }, { "epoch": 0.17030939540164633, "grad_norm": 3.146318197250366, "learning_rate": 1.9966511799378402e-05, "loss": 2.2685, "step": 1200 }, { "epoch": 0.1731478853250071, "grad_norm": 3.36348295211792, "learning_rate": 1.996528256010338e-05, "loss": 2.3255, "step": 1220 }, { "epoch": 0.17598637524836785, "grad_norm": 3.095618963241577, "learning_rate": 1.996403120422157e-05, "loss": 2.2922, "step": 1240 }, { "epoch": 0.17882486517172863, "grad_norm": 3.5105535984039307, "learning_rate": 1.996275773451019e-05, "loss": 2.3269, "step": 1260 }, { "epoch": 0.1816633550950894, "grad_norm": 3.157728910446167, "learning_rate": 1.9961462153795533e-05, "loss": 2.3322, "step": 1280 }, { "epoch": 0.18450184501845018, "grad_norm": 3.3705925941467285, "learning_rate": 1.9960144464952976e-05, "loss": 2.287, "step": 1300 }, { "epoch": 0.18734033494181096, "grad_norm": 3.1653616428375244, "learning_rate": 1.995880467090694e-05, "loss": 2.3525, "step": 1320 }, { "epoch": 0.19017882486517174, "grad_norm": 3.0760676860809326, "learning_rate": 1.9957442774630924e-05, "loss": 2.2522, "step": 1340 }, { "epoch": 0.1930173147885325, "grad_norm": 3.45892071723938, "learning_rate": 1.9956058779147473e-05, "loss": 2.2861, "step": 1360 }, { "epoch": 0.19585580471189326, "grad_norm": 3.3116652965545654, "learning_rate": 1.995465268752818e-05, "loss": 2.3306, "step": 1380 }, { "epoch": 0.19869429463525404, "grad_norm": 3.031916618347168, "learning_rate": 1.9953224502893676e-05, "loss": 2.2679, "step": 1400 }, { "epoch": 0.2015327845586148, "grad_norm": 3.0776188373565674, "learning_rate": 1.9951774228413625e-05, "loss": 2.3735, "step": 1420 }, { "epoch": 0.2043712744819756, "grad_norm": 3.384258985519409, "learning_rate": 1.995030186730672e-05, "loss": 2.2914, "step": 1440 }, { "epoch": 0.20720976440533637, "grad_norm": 3.3402061462402344, "learning_rate": 1.994880742284066e-05, "loss": 2.3626, "step": 1460 }, { "epoch": 0.21004825432869714, "grad_norm": 2.918071746826172, "learning_rate": 1.9947290898332174e-05, "loss": 2.2889, "step": 1480 }, { "epoch": 0.21288674425205792, "grad_norm": 2.990377187728882, "learning_rate": 1.9945752297146978e-05, "loss": 2.2464, "step": 1500 }, { "epoch": 0.21572523417541867, "grad_norm": 3.105083703994751, "learning_rate": 1.994419162269979e-05, "loss": 2.294, "step": 1520 }, { "epoch": 0.21856372409877944, "grad_norm": 3.264815330505371, "learning_rate": 1.9942608878454325e-05, "loss": 2.3059, "step": 1540 }, { "epoch": 0.22140221402214022, "grad_norm": 2.8952152729034424, "learning_rate": 1.9941004067923262e-05, "loss": 2.2592, "step": 1560 }, { "epoch": 0.224240703945501, "grad_norm": 3.1130104064941406, "learning_rate": 1.993937719466827e-05, "loss": 2.3284, "step": 1580 }, { "epoch": 0.22707919386886177, "grad_norm": 2.9192771911621094, "learning_rate": 1.993772826229997e-05, "loss": 2.2846, "step": 1600 }, { "epoch": 0.22991768379222255, "grad_norm": 3.2377591133117676, "learning_rate": 1.993605727447795e-05, "loss": 2.2645, "step": 1620 }, { "epoch": 0.2327561737155833, "grad_norm": 3.0033068656921387, "learning_rate": 1.9934364234910742e-05, "loss": 2.2355, "step": 1640 }, { "epoch": 0.23559466363894407, "grad_norm": 3.1228458881378174, "learning_rate": 1.993264914735582e-05, "loss": 2.3017, "step": 1660 }, { "epoch": 0.23843315356230485, "grad_norm": 2.9229018688201904, "learning_rate": 1.993091201561959e-05, "loss": 2.2848, "step": 1680 }, { "epoch": 0.24127164348566563, "grad_norm": 3.0824317932128906, "learning_rate": 1.9929152843557386e-05, "loss": 2.2254, "step": 1700 }, { "epoch": 0.2441101334090264, "grad_norm": 2.925661325454712, "learning_rate": 1.992737163507345e-05, "loss": 2.2456, "step": 1720 }, { "epoch": 0.24694862333238718, "grad_norm": 3.018491268157959, "learning_rate": 1.9925568394120934e-05, "loss": 2.2679, "step": 1740 }, { "epoch": 0.24978711325574796, "grad_norm": 3.204108476638794, "learning_rate": 1.9923743124701896e-05, "loss": 2.2565, "step": 1760 }, { "epoch": 0.2526256031791087, "grad_norm": 3.0354995727539062, "learning_rate": 1.992189583086727e-05, "loss": 2.2631, "step": 1780 }, { "epoch": 0.2554640931024695, "grad_norm": 3.0141243934631348, "learning_rate": 1.9920026516716878e-05, "loss": 2.2565, "step": 1800 }, { "epoch": 0.25830258302583026, "grad_norm": 2.961923837661743, "learning_rate": 1.9918135186399413e-05, "loss": 2.2729, "step": 1820 }, { "epoch": 0.26114107294919103, "grad_norm": 2.9746248722076416, "learning_rate": 1.991622184411243e-05, "loss": 2.2617, "step": 1840 }, { "epoch": 0.2639795628725518, "grad_norm": 2.7948062419891357, "learning_rate": 1.991428649410233e-05, "loss": 2.2316, "step": 1860 }, { "epoch": 0.2668180527959126, "grad_norm": 2.9185383319854736, "learning_rate": 1.9912329140664368e-05, "loss": 2.2857, "step": 1880 }, { "epoch": 0.26965654271927336, "grad_norm": 2.972193479537964, "learning_rate": 1.9910349788142628e-05, "loss": 2.2455, "step": 1900 }, { "epoch": 0.27249503264263414, "grad_norm": 2.7558836936950684, "learning_rate": 1.990834844093001e-05, "loss": 2.2298, "step": 1920 }, { "epoch": 0.2753335225659949, "grad_norm": 2.7793076038360596, "learning_rate": 1.9906325103468244e-05, "loss": 2.2759, "step": 1940 }, { "epoch": 0.27817201248935564, "grad_norm": 2.748948097229004, "learning_rate": 1.990427978024785e-05, "loss": 2.1984, "step": 1960 }, { "epoch": 0.2810105024127164, "grad_norm": 2.945160388946533, "learning_rate": 1.9902212475808148e-05, "loss": 2.2617, "step": 1980 }, { "epoch": 0.2838489923360772, "grad_norm": 2.8768680095672607, "learning_rate": 1.9900123194737248e-05, "loss": 2.25, "step": 2000 }, { "epoch": 0.28668748225943796, "grad_norm": 2.8559556007385254, "learning_rate": 1.9898011941672025e-05, "loss": 2.15, "step": 2020 }, { "epoch": 0.28952597218279874, "grad_norm": 2.77445650100708, "learning_rate": 1.989587872129812e-05, "loss": 2.259, "step": 2040 }, { "epoch": 0.2923644621061595, "grad_norm": 2.8193066120147705, "learning_rate": 1.9893723538349935e-05, "loss": 2.2263, "step": 2060 }, { "epoch": 0.2952029520295203, "grad_norm": 2.9851784706115723, "learning_rate": 1.9891546397610605e-05, "loss": 2.2474, "step": 2080 }, { "epoch": 0.29804144195288107, "grad_norm": 2.82657790184021, "learning_rate": 1.9889347303912e-05, "loss": 2.2451, "step": 2100 }, { "epoch": 0.30087993187624185, "grad_norm": 2.808293104171753, "learning_rate": 1.988712626213472e-05, "loss": 2.241, "step": 2120 }, { "epoch": 0.3037184217996026, "grad_norm": 2.742231845855713, "learning_rate": 1.988488327720806e-05, "loss": 2.2332, "step": 2140 }, { "epoch": 0.3065569117229634, "grad_norm": 3.018608570098877, "learning_rate": 1.9882618354110032e-05, "loss": 2.2723, "step": 2160 }, { "epoch": 0.3093954016463242, "grad_norm": 2.884113073348999, "learning_rate": 1.9880331497867323e-05, "loss": 2.1963, "step": 2180 }, { "epoch": 0.31223389156968495, "grad_norm": 3.0162296295166016, "learning_rate": 1.987802271355531e-05, "loss": 2.2438, "step": 2200 }, { "epoch": 0.3150723814930457, "grad_norm": 2.8027822971343994, "learning_rate": 1.987569200629802e-05, "loss": 2.2643, "step": 2220 }, { "epoch": 0.31791087141640645, "grad_norm": 2.692643404006958, "learning_rate": 1.9873339381268156e-05, "loss": 2.242, "step": 2240 }, { "epoch": 0.3207493613397672, "grad_norm": 2.7886853218078613, "learning_rate": 1.9870964843687043e-05, "loss": 2.2297, "step": 2260 }, { "epoch": 0.323587851263128, "grad_norm": 3.0669069290161133, "learning_rate": 1.9868568398824654e-05, "loss": 2.2019, "step": 2280 }, { "epoch": 0.3264263411864888, "grad_norm": 2.902926206588745, "learning_rate": 1.986615005199958e-05, "loss": 2.3106, "step": 2300 }, { "epoch": 0.32926483110984955, "grad_norm": 2.7862541675567627, "learning_rate": 1.986370980857901e-05, "loss": 2.2834, "step": 2320 }, { "epoch": 0.33210332103321033, "grad_norm": 2.7140722274780273, "learning_rate": 1.986124767397874e-05, "loss": 2.2265, "step": 2340 }, { "epoch": 0.3349418109565711, "grad_norm": 2.835646152496338, "learning_rate": 1.9858763653663148e-05, "loss": 2.203, "step": 2360 }, { "epoch": 0.3377803008799319, "grad_norm": 2.9410243034362793, "learning_rate": 1.9856257753145177e-05, "loss": 2.2492, "step": 2380 }, { "epoch": 0.34061879080329266, "grad_norm": 3.247437000274658, "learning_rate": 1.985372997798635e-05, "loss": 2.2448, "step": 2400 }, { "epoch": 0.34345728072665344, "grad_norm": 2.8364458084106445, "learning_rate": 1.9851180333796714e-05, "loss": 2.2036, "step": 2420 }, { "epoch": 0.3462957706500142, "grad_norm": 2.838958263397217, "learning_rate": 1.9848608826234863e-05, "loss": 2.1771, "step": 2440 }, { "epoch": 0.349134260573375, "grad_norm": 2.862950563430786, "learning_rate": 1.9846015461007918e-05, "loss": 2.1867, "step": 2460 }, { "epoch": 0.3519727504967357, "grad_norm": 2.828343629837036, "learning_rate": 1.98434002438715e-05, "loss": 2.2224, "step": 2480 }, { "epoch": 0.3548112404200965, "grad_norm": 2.8981378078460693, "learning_rate": 1.9840763180629736e-05, "loss": 2.208, "step": 2500 }, { "epoch": 0.35764973034345726, "grad_norm": 2.8314120769500732, "learning_rate": 1.9838104277135225e-05, "loss": 2.2489, "step": 2520 }, { "epoch": 0.36048822026681804, "grad_norm": 2.586374521255493, "learning_rate": 1.9835423539289058e-05, "loss": 2.1323, "step": 2540 }, { "epoch": 0.3633267101901788, "grad_norm": 2.5036492347717285, "learning_rate": 1.983272097304077e-05, "loss": 2.2238, "step": 2560 }, { "epoch": 0.3661652001135396, "grad_norm": 2.7843472957611084, "learning_rate": 1.982999658438834e-05, "loss": 2.2389, "step": 2580 }, { "epoch": 0.36900369003690037, "grad_norm": 2.9135963916778564, "learning_rate": 1.9827250379378182e-05, "loss": 2.1979, "step": 2600 }, { "epoch": 0.37184217996026114, "grad_norm": 2.5782527923583984, "learning_rate": 1.9824482364105133e-05, "loss": 2.1661, "step": 2620 }, { "epoch": 0.3746806698836219, "grad_norm": 2.8325178623199463, "learning_rate": 1.9821692544712433e-05, "loss": 2.2074, "step": 2640 }, { "epoch": 0.3775191598069827, "grad_norm": 2.7903082370758057, "learning_rate": 1.9818880927391708e-05, "loss": 2.2277, "step": 2660 }, { "epoch": 0.3803576497303435, "grad_norm": 2.8497819900512695, "learning_rate": 1.9816047518382964e-05, "loss": 2.2395, "step": 2680 }, { "epoch": 0.38319613965370425, "grad_norm": 2.572450637817383, "learning_rate": 1.9813192323974578e-05, "loss": 2.2085, "step": 2700 }, { "epoch": 0.386034629577065, "grad_norm": 2.748239517211914, "learning_rate": 1.981031535050326e-05, "loss": 2.1899, "step": 2720 }, { "epoch": 0.3888731195004258, "grad_norm": 2.7084801197052, "learning_rate": 1.980741660435408e-05, "loss": 2.196, "step": 2740 }, { "epoch": 0.3917116094237865, "grad_norm": 2.7431952953338623, "learning_rate": 1.9804496091960404e-05, "loss": 2.1983, "step": 2760 }, { "epoch": 0.3945500993471473, "grad_norm": 2.6878323554992676, "learning_rate": 1.9801553819803926e-05, "loss": 2.2041, "step": 2780 }, { "epoch": 0.3973885892705081, "grad_norm": 2.8030176162719727, "learning_rate": 1.979858979441462e-05, "loss": 2.2061, "step": 2800 }, { "epoch": 0.40022707919386885, "grad_norm": 2.7128846645355225, "learning_rate": 1.979560402237074e-05, "loss": 2.111, "step": 2820 }, { "epoch": 0.4030655691172296, "grad_norm": 2.966721773147583, "learning_rate": 1.979259651029881e-05, "loss": 2.1677, "step": 2840 }, { "epoch": 0.4059040590405904, "grad_norm": 2.6102325916290283, "learning_rate": 1.97895672648736e-05, "loss": 2.1504, "step": 2860 }, { "epoch": 0.4087425489639512, "grad_norm": 2.9061434268951416, "learning_rate": 1.9786516292818112e-05, "loss": 2.1663, "step": 2880 }, { "epoch": 0.41158103888731196, "grad_norm": 2.603285789489746, "learning_rate": 1.9783443600903566e-05, "loss": 2.1724, "step": 2900 }, { "epoch": 0.41441952881067273, "grad_norm": 2.718766927719116, "learning_rate": 1.9780349195949392e-05, "loss": 2.2015, "step": 2920 }, { "epoch": 0.4172580187340335, "grad_norm": 2.7435643672943115, "learning_rate": 1.977723308482321e-05, "loss": 2.1991, "step": 2940 }, { "epoch": 0.4200965086573943, "grad_norm": 2.857079029083252, "learning_rate": 1.97740952744408e-05, "loss": 2.2316, "step": 2960 }, { "epoch": 0.42293499858075506, "grad_norm": 2.8442587852478027, "learning_rate": 1.977093577176612e-05, "loss": 2.2107, "step": 2980 }, { "epoch": 0.42577348850411584, "grad_norm": 2.656696081161499, "learning_rate": 1.976775458381126e-05, "loss": 2.2165, "step": 3000 }, { "epoch": 0.42861197842747656, "grad_norm": 2.761887550354004, "learning_rate": 1.976455171763644e-05, "loss": 2.1912, "step": 3020 }, { "epoch": 0.43145046835083734, "grad_norm": 2.4235520362854004, "learning_rate": 1.9761327180349988e-05, "loss": 2.167, "step": 3040 }, { "epoch": 0.4342889582741981, "grad_norm": 2.6976635456085205, "learning_rate": 1.9758080979108337e-05, "loss": 2.2195, "step": 3060 }, { "epoch": 0.4371274481975589, "grad_norm": 2.649724245071411, "learning_rate": 1.975481312111599e-05, "loss": 2.1705, "step": 3080 }, { "epoch": 0.43996593812091966, "grad_norm": 2.737680196762085, "learning_rate": 1.975152361362552e-05, "loss": 2.2257, "step": 3100 }, { "epoch": 0.44280442804428044, "grad_norm": 2.646932601928711, "learning_rate": 1.9748212463937548e-05, "loss": 2.1985, "step": 3120 }, { "epoch": 0.4456429179676412, "grad_norm": 2.6977391242980957, "learning_rate": 1.974487967940073e-05, "loss": 2.1786, "step": 3140 }, { "epoch": 0.448481407891002, "grad_norm": 2.515000581741333, "learning_rate": 1.9741525267411726e-05, "loss": 2.1654, "step": 3160 }, { "epoch": 0.45131989781436277, "grad_norm": 2.5590004920959473, "learning_rate": 1.973814923541521e-05, "loss": 2.1584, "step": 3180 }, { "epoch": 0.45415838773772355, "grad_norm": 2.5912885665893555, "learning_rate": 1.973475159090383e-05, "loss": 2.1532, "step": 3200 }, { "epoch": 0.4569968776610843, "grad_norm": 2.6389095783233643, "learning_rate": 1.9731332341418198e-05, "loss": 2.1376, "step": 3220 }, { "epoch": 0.4598353675844451, "grad_norm": 2.614657163619995, "learning_rate": 1.972789149454688e-05, "loss": 2.2243, "step": 3240 }, { "epoch": 0.4626738575078059, "grad_norm": 2.7434704303741455, "learning_rate": 1.972442905792638e-05, "loss": 2.2127, "step": 3260 }, { "epoch": 0.4655123474311666, "grad_norm": 2.6676321029663086, "learning_rate": 1.9720945039241108e-05, "loss": 2.1902, "step": 3280 }, { "epoch": 0.46835083735452737, "grad_norm": 2.798278570175171, "learning_rate": 1.9717439446223376e-05, "loss": 2.1455, "step": 3300 }, { "epoch": 0.47118932727788815, "grad_norm": 2.687197208404541, "learning_rate": 1.971391228665337e-05, "loss": 2.1823, "step": 3320 }, { "epoch": 0.4740278172012489, "grad_norm": 2.586503744125366, "learning_rate": 1.9710363568359154e-05, "loss": 2.1767, "step": 3340 }, { "epoch": 0.4768663071246097, "grad_norm": 2.6018874645233154, "learning_rate": 1.9706793299216635e-05, "loss": 2.1614, "step": 3360 }, { "epoch": 0.4797047970479705, "grad_norm": 2.604524850845337, "learning_rate": 1.970320148714954e-05, "loss": 2.1441, "step": 3380 }, { "epoch": 0.48254328697133125, "grad_norm": 2.4274983406066895, "learning_rate": 1.9699588140129415e-05, "loss": 2.2016, "step": 3400 }, { "epoch": 0.48538177689469203, "grad_norm": 2.5898208618164062, "learning_rate": 1.9695953266175598e-05, "loss": 2.2179, "step": 3420 }, { "epoch": 0.4882202668180528, "grad_norm": 2.5731120109558105, "learning_rate": 1.9692296873355208e-05, "loss": 2.16, "step": 3440 }, { "epoch": 0.4910587567414136, "grad_norm": 2.7258307933807373, "learning_rate": 1.9688618969783115e-05, "loss": 2.1519, "step": 3460 }, { "epoch": 0.49389724666477436, "grad_norm": 2.662222146987915, "learning_rate": 1.968491956362193e-05, "loss": 2.1328, "step": 3480 }, { "epoch": 0.49673573658813513, "grad_norm": 2.6426608562469482, "learning_rate": 1.9681198663081987e-05, "loss": 2.207, "step": 3500 }, { "epoch": 0.4995742265114959, "grad_norm": 2.7795515060424805, "learning_rate": 1.9677456276421334e-05, "loss": 2.1619, "step": 3520 }, { "epoch": 0.5024127164348566, "grad_norm": 2.4003286361694336, "learning_rate": 1.967369241194569e-05, "loss": 2.193, "step": 3540 }, { "epoch": 0.5052512063582174, "grad_norm": 2.5916852951049805, "learning_rate": 1.9669907078008447e-05, "loss": 2.1921, "step": 3560 }, { "epoch": 0.5080896962815782, "grad_norm": 2.6541996002197266, "learning_rate": 1.9666100283010646e-05, "loss": 2.2601, "step": 3580 }, { "epoch": 0.510928186204939, "grad_norm": 2.414781332015991, "learning_rate": 1.9662272035400956e-05, "loss": 2.2281, "step": 3600 }, { "epoch": 0.5137666761282997, "grad_norm": 2.641268730163574, "learning_rate": 1.965842234367566e-05, "loss": 2.2028, "step": 3620 }, { "epoch": 0.5166051660516605, "grad_norm": 2.681506633758545, "learning_rate": 1.9654551216378635e-05, "loss": 2.1499, "step": 3640 }, { "epoch": 0.5194436559750213, "grad_norm": 2.6079370975494385, "learning_rate": 1.965065866210132e-05, "loss": 2.1214, "step": 3660 }, { "epoch": 0.5222821458983821, "grad_norm": 2.673679828643799, "learning_rate": 1.9646744689482725e-05, "loss": 2.1564, "step": 3680 }, { "epoch": 0.5251206358217428, "grad_norm": 2.4398257732391357, "learning_rate": 1.9642809307209383e-05, "loss": 2.1304, "step": 3700 }, { "epoch": 0.5279591257451036, "grad_norm": 2.6184492111206055, "learning_rate": 1.963885252401535e-05, "loss": 2.1672, "step": 3720 }, { "epoch": 0.5307976156684644, "grad_norm": 2.2611680030822754, "learning_rate": 1.9634874348682166e-05, "loss": 2.2151, "step": 3740 }, { "epoch": 0.5336361055918252, "grad_norm": 2.4822325706481934, "learning_rate": 1.9630874790038866e-05, "loss": 2.1541, "step": 3760 }, { "epoch": 0.536474595515186, "grad_norm": 2.8098084926605225, "learning_rate": 1.9626853856961933e-05, "loss": 2.1712, "step": 3780 }, { "epoch": 0.5393130854385467, "grad_norm": 2.5705819129943848, "learning_rate": 1.9622811558375284e-05, "loss": 2.1451, "step": 3800 }, { "epoch": 0.5421515753619075, "grad_norm": 2.613226890563965, "learning_rate": 1.9618747903250264e-05, "loss": 2.1573, "step": 3820 }, { "epoch": 0.5449900652852683, "grad_norm": 2.6075191497802734, "learning_rate": 1.9614662900605602e-05, "loss": 2.1508, "step": 3840 }, { "epoch": 0.547828555208629, "grad_norm": 2.4439051151275635, "learning_rate": 1.9610556559507418e-05, "loss": 2.1322, "step": 3860 }, { "epoch": 0.5506670451319898, "grad_norm": 2.9040069580078125, "learning_rate": 1.9606428889069182e-05, "loss": 2.1463, "step": 3880 }, { "epoch": 0.5535055350553506, "grad_norm": 2.359039545059204, "learning_rate": 1.960227989845171e-05, "loss": 2.1544, "step": 3900 }, { "epoch": 0.5563440249787113, "grad_norm": 2.526554822921753, "learning_rate": 1.959810959686312e-05, "loss": 2.2045, "step": 3920 }, { "epoch": 0.559182514902072, "grad_norm": 2.6207809448242188, "learning_rate": 1.9593917993558848e-05, "loss": 2.1386, "step": 3940 }, { "epoch": 0.5620210048254328, "grad_norm": 2.5125861167907715, "learning_rate": 1.9589705097841586e-05, "loss": 2.1219, "step": 3960 }, { "epoch": 0.5648594947487936, "grad_norm": 2.6025166511535645, "learning_rate": 1.9585470919061294e-05, "loss": 2.119, "step": 3980 }, { "epoch": 0.5676979846721544, "grad_norm": 2.5204999446868896, "learning_rate": 1.9581215466615166e-05, "loss": 2.1595, "step": 4000 }, { "epoch": 0.5705364745955152, "grad_norm": 2.5611536502838135, "learning_rate": 1.95769387499476e-05, "loss": 2.1868, "step": 4020 }, { "epoch": 0.5733749645188759, "grad_norm": 2.5856382846832275, "learning_rate": 1.95726407785502e-05, "loss": 2.2, "step": 4040 }, { "epoch": 0.5762134544422367, "grad_norm": 2.6095612049102783, "learning_rate": 1.9568321561961737e-05, "loss": 2.1706, "step": 4060 }, { "epoch": 0.5790519443655975, "grad_norm": 2.4537999629974365, "learning_rate": 1.956398110976813e-05, "loss": 2.1302, "step": 4080 }, { "epoch": 0.5818904342889583, "grad_norm": 2.474921941757202, "learning_rate": 1.9559619431602427e-05, "loss": 2.1562, "step": 4100 }, { "epoch": 0.584728924212319, "grad_norm": 2.456381320953369, "learning_rate": 1.9555236537144795e-05, "loss": 2.1716, "step": 4120 }, { "epoch": 0.5875674141356798, "grad_norm": 2.806540012359619, "learning_rate": 1.9550832436122475e-05, "loss": 2.1678, "step": 4140 }, { "epoch": 0.5904059040590406, "grad_norm": 2.48787260055542, "learning_rate": 1.954640713830978e-05, "loss": 2.1322, "step": 4160 }, { "epoch": 0.5932443939824014, "grad_norm": 2.5663275718688965, "learning_rate": 1.954196065352806e-05, "loss": 2.137, "step": 4180 }, { "epoch": 0.5960828839057621, "grad_norm": 2.3451528549194336, "learning_rate": 1.9537492991645695e-05, "loss": 2.1941, "step": 4200 }, { "epoch": 0.5989213738291229, "grad_norm": 2.5931131839752197, "learning_rate": 1.953300416257806e-05, "loss": 2.1699, "step": 4220 }, { "epoch": 0.6017598637524837, "grad_norm": 2.4558968544006348, "learning_rate": 1.9528494176287507e-05, "loss": 2.1588, "step": 4240 }, { "epoch": 0.6045983536758445, "grad_norm": 2.2663233280181885, "learning_rate": 1.9523963042783346e-05, "loss": 2.1294, "step": 4260 }, { "epoch": 0.6074368435992052, "grad_norm": 2.6679797172546387, "learning_rate": 1.9519410772121823e-05, "loss": 2.1565, "step": 4280 }, { "epoch": 0.610275333522566, "grad_norm": 2.4935338497161865, "learning_rate": 1.9514837374406082e-05, "loss": 2.1256, "step": 4300 }, { "epoch": 0.6131138234459268, "grad_norm": 2.3807930946350098, "learning_rate": 1.951024285978618e-05, "loss": 2.1397, "step": 4320 }, { "epoch": 0.6159523133692876, "grad_norm": 2.400590419769287, "learning_rate": 1.950562723845901e-05, "loss": 2.1006, "step": 4340 }, { "epoch": 0.6187908032926484, "grad_norm": 2.517772912979126, "learning_rate": 1.9500990520668335e-05, "loss": 2.1378, "step": 4360 }, { "epoch": 0.6216292932160091, "grad_norm": 2.5135929584503174, "learning_rate": 1.9496332716704723e-05, "loss": 2.1251, "step": 4380 }, { "epoch": 0.6244677831393699, "grad_norm": 2.643582582473755, "learning_rate": 1.9491653836905543e-05, "loss": 2.1378, "step": 4400 }, { "epoch": 0.6273062730627307, "grad_norm": 2.488123893737793, "learning_rate": 1.9486953891654944e-05, "loss": 2.1033, "step": 4420 }, { "epoch": 0.6301447629860913, "grad_norm": 2.365532636642456, "learning_rate": 1.9482232891383825e-05, "loss": 2.153, "step": 4440 }, { "epoch": 0.6329832529094521, "grad_norm": 2.4700963497161865, "learning_rate": 1.9477490846569807e-05, "loss": 2.1167, "step": 4460 }, { "epoch": 0.6358217428328129, "grad_norm": 2.506258487701416, "learning_rate": 1.9472727767737224e-05, "loss": 2.2042, "step": 4480 }, { "epoch": 0.6386602327561737, "grad_norm": 2.416388511657715, "learning_rate": 1.9467943665457088e-05, "loss": 2.1554, "step": 4500 }, { "epoch": 0.6414987226795344, "grad_norm": 2.4400341510772705, "learning_rate": 1.9463138550347075e-05, "loss": 2.171, "step": 4520 }, { "epoch": 0.6443372126028952, "grad_norm": 2.362725019454956, "learning_rate": 1.9458312433071496e-05, "loss": 2.1572, "step": 4540 }, { "epoch": 0.647175702526256, "grad_norm": 2.371147394180298, "learning_rate": 1.9453465324341266e-05, "loss": 2.1663, "step": 4560 }, { "epoch": 0.6500141924496168, "grad_norm": 2.58335280418396, "learning_rate": 1.944859723491389e-05, "loss": 2.1297, "step": 4580 }, { "epoch": 0.6528526823729776, "grad_norm": 2.490827798843384, "learning_rate": 1.944370817559344e-05, "loss": 2.2035, "step": 4600 }, { "epoch": 0.6556911722963383, "grad_norm": 2.4635965824127197, "learning_rate": 1.943879815723053e-05, "loss": 2.1013, "step": 4620 }, { "epoch": 0.6585296622196991, "grad_norm": 2.5876410007476807, "learning_rate": 1.9433867190722285e-05, "loss": 2.1447, "step": 4640 }, { "epoch": 0.6613681521430599, "grad_norm": 2.5840892791748047, "learning_rate": 1.942891528701232e-05, "loss": 2.1644, "step": 4660 }, { "epoch": 0.6642066420664207, "grad_norm": 2.5741055011749268, "learning_rate": 1.942394245709073e-05, "loss": 2.1624, "step": 4680 }, { "epoch": 0.6670451319897814, "grad_norm": 2.302178144454956, "learning_rate": 1.941894871199403e-05, "loss": 2.0782, "step": 4700 }, { "epoch": 0.6698836219131422, "grad_norm": 2.483271598815918, "learning_rate": 1.9413934062805176e-05, "loss": 2.1564, "step": 4720 }, { "epoch": 0.672722111836503, "grad_norm": 2.4705843925476074, "learning_rate": 1.9408898520653508e-05, "loss": 2.1482, "step": 4740 }, { "epoch": 0.6755606017598638, "grad_norm": 2.606477737426758, "learning_rate": 1.940384209671473e-05, "loss": 2.0988, "step": 4760 }, { "epoch": 0.6783990916832245, "grad_norm": 2.4687209129333496, "learning_rate": 1.9398764802210904e-05, "loss": 2.0926, "step": 4780 }, { "epoch": 0.6812375816065853, "grad_norm": 2.4011576175689697, "learning_rate": 1.93936666484104e-05, "loss": 2.1136, "step": 4800 }, { "epoch": 0.6840760715299461, "grad_norm": 2.5158729553222656, "learning_rate": 1.9388547646627884e-05, "loss": 2.1232, "step": 4820 }, { "epoch": 0.6869145614533069, "grad_norm": 2.484632730484009, "learning_rate": 1.9383407808224294e-05, "loss": 2.1755, "step": 4840 }, { "epoch": 0.6897530513766676, "grad_norm": 2.397512674331665, "learning_rate": 1.937824714460681e-05, "loss": 2.0773, "step": 4860 }, { "epoch": 0.6925915413000284, "grad_norm": 2.4654295444488525, "learning_rate": 1.937306566722884e-05, "loss": 2.1278, "step": 4880 }, { "epoch": 0.6954300312233892, "grad_norm": 2.4697375297546387, "learning_rate": 1.9367863387589968e-05, "loss": 2.1132, "step": 4900 }, { "epoch": 0.69826852114675, "grad_norm": 2.4779441356658936, "learning_rate": 1.936264031723596e-05, "loss": 2.1526, "step": 4920 }, { "epoch": 0.7011070110701108, "grad_norm": 2.50618839263916, "learning_rate": 1.9357396467758717e-05, "loss": 2.0745, "step": 4940 }, { "epoch": 0.7039455009934714, "grad_norm": 2.423607587814331, "learning_rate": 1.935213185079626e-05, "loss": 2.1256, "step": 4960 }, { "epoch": 0.7067839909168322, "grad_norm": 2.4815824031829834, "learning_rate": 1.93468464780327e-05, "loss": 2.1666, "step": 4980 }, { "epoch": 0.709622480840193, "grad_norm": 2.480020046234131, "learning_rate": 1.9341540361198214e-05, "loss": 2.1197, "step": 5000 }, { "epoch": 0.7124609707635537, "grad_norm": 2.4548122882843018, "learning_rate": 1.9336213512069013e-05, "loss": 2.1094, "step": 5020 }, { "epoch": 0.7152994606869145, "grad_norm": 2.5091676712036133, "learning_rate": 1.9330865942467325e-05, "loss": 2.1136, "step": 5040 }, { "epoch": 0.7181379506102753, "grad_norm": 2.4026217460632324, "learning_rate": 1.9325497664261362e-05, "loss": 2.0612, "step": 5060 }, { "epoch": 0.7209764405336361, "grad_norm": 2.4376442432403564, "learning_rate": 1.9320108689365296e-05, "loss": 2.1272, "step": 5080 }, { "epoch": 0.7238149304569969, "grad_norm": 2.2924327850341797, "learning_rate": 1.9314699029739236e-05, "loss": 2.0969, "step": 5100 }, { "epoch": 0.7266534203803576, "grad_norm": 2.360736131668091, "learning_rate": 1.9309268697389192e-05, "loss": 2.0903, "step": 5120 }, { "epoch": 0.7294919103037184, "grad_norm": 2.524545669555664, "learning_rate": 1.9303817704367062e-05, "loss": 2.1305, "step": 5140 }, { "epoch": 0.7323304002270792, "grad_norm": 2.470212936401367, "learning_rate": 1.9298346062770586e-05, "loss": 2.1462, "step": 5160 }, { "epoch": 0.73516889015044, "grad_norm": 2.351810932159424, "learning_rate": 1.9292853784743348e-05, "loss": 2.1547, "step": 5180 }, { "epoch": 0.7380073800738007, "grad_norm": 2.469377040863037, "learning_rate": 1.9287340882474714e-05, "loss": 2.0977, "step": 5200 }, { "epoch": 0.7408458699971615, "grad_norm": 2.6058297157287598, "learning_rate": 1.9281807368199833e-05, "loss": 2.1303, "step": 5220 }, { "epoch": 0.7436843599205223, "grad_norm": 2.3007125854492188, "learning_rate": 1.9276253254199594e-05, "loss": 2.1306, "step": 5240 }, { "epoch": 0.7465228498438831, "grad_norm": 2.489659070968628, "learning_rate": 1.927067855280061e-05, "loss": 2.0775, "step": 5260 }, { "epoch": 0.7493613397672438, "grad_norm": 2.411015748977661, "learning_rate": 1.926508327637519e-05, "loss": 2.1206, "step": 5280 }, { "epoch": 0.7521998296906046, "grad_norm": 2.344170570373535, "learning_rate": 1.9259467437341282e-05, "loss": 2.0912, "step": 5300 }, { "epoch": 0.7550383196139654, "grad_norm": 2.503267765045166, "learning_rate": 1.92538310481625e-05, "loss": 2.1283, "step": 5320 }, { "epoch": 0.7578768095373262, "grad_norm": 2.4120101928710938, "learning_rate": 1.9248174121348046e-05, "loss": 2.0932, "step": 5340 }, { "epoch": 0.760715299460687, "grad_norm": 2.414590358734131, "learning_rate": 1.9242496669452717e-05, "loss": 2.1218, "step": 5360 }, { "epoch": 0.7635537893840477, "grad_norm": 2.3509671688079834, "learning_rate": 1.9236798705076852e-05, "loss": 2.1228, "step": 5380 }, { "epoch": 0.7663922793074085, "grad_norm": 2.5387187004089355, "learning_rate": 1.9231080240866312e-05, "loss": 2.1107, "step": 5400 }, { "epoch": 0.7692307692307693, "grad_norm": 2.4209885597229004, "learning_rate": 1.922534128951247e-05, "loss": 2.1458, "step": 5420 }, { "epoch": 0.77206925915413, "grad_norm": 2.643648386001587, "learning_rate": 1.9219581863752154e-05, "loss": 2.0916, "step": 5440 }, { "epoch": 0.7749077490774908, "grad_norm": 2.45778489112854, "learning_rate": 1.9213801976367633e-05, "loss": 2.1203, "step": 5460 }, { "epoch": 0.7777462390008516, "grad_norm": 2.3182876110076904, "learning_rate": 1.9208001640186596e-05, "loss": 2.1384, "step": 5480 }, { "epoch": 0.7805847289242123, "grad_norm": 2.493279457092285, "learning_rate": 1.9202180868082108e-05, "loss": 2.0429, "step": 5500 }, { "epoch": 0.783423218847573, "grad_norm": 2.3751583099365234, "learning_rate": 1.919633967297259e-05, "loss": 2.1104, "step": 5520 }, { "epoch": 0.7862617087709338, "grad_norm": 2.4290273189544678, "learning_rate": 1.9190478067821794e-05, "loss": 2.0739, "step": 5540 }, { "epoch": 0.7891001986942946, "grad_norm": 2.339836359024048, "learning_rate": 1.918459606563876e-05, "loss": 2.0902, "step": 5560 }, { "epoch": 0.7919386886176554, "grad_norm": 2.305302381515503, "learning_rate": 1.917869367947781e-05, "loss": 2.1053, "step": 5580 }, { "epoch": 0.7947771785410161, "grad_norm": 2.285282850265503, "learning_rate": 1.9172770922438495e-05, "loss": 2.0878, "step": 5600 }, { "epoch": 0.7976156684643769, "grad_norm": 2.3140029907226562, "learning_rate": 1.9166827807665575e-05, "loss": 2.0731, "step": 5620 }, { "epoch": 0.8004541583877377, "grad_norm": 2.3113839626312256, "learning_rate": 1.9160864348348998e-05, "loss": 2.1642, "step": 5640 }, { "epoch": 0.8032926483110985, "grad_norm": 2.3925836086273193, "learning_rate": 1.9154880557723864e-05, "loss": 2.156, "step": 5660 }, { "epoch": 0.8061311382344593, "grad_norm": 2.4411497116088867, "learning_rate": 1.9148876449070387e-05, "loss": 2.068, "step": 5680 }, { "epoch": 0.80896962815782, "grad_norm": 2.390047311782837, "learning_rate": 1.9142852035713885e-05, "loss": 2.1057, "step": 5700 }, { "epoch": 0.8118081180811808, "grad_norm": 2.4297895431518555, "learning_rate": 1.9136807331024733e-05, "loss": 2.0524, "step": 5720 }, { "epoch": 0.8146466080045416, "grad_norm": 2.3759422302246094, "learning_rate": 1.9130742348418344e-05, "loss": 2.1312, "step": 5740 }, { "epoch": 0.8174850979279024, "grad_norm": 2.458104372024536, "learning_rate": 1.9124657101355128e-05, "loss": 2.1641, "step": 5760 }, { "epoch": 0.8203235878512631, "grad_norm": 2.372835159301758, "learning_rate": 1.9118551603340477e-05, "loss": 2.0928, "step": 5780 }, { "epoch": 0.8231620777746239, "grad_norm": 2.4486286640167236, "learning_rate": 1.9112425867924722e-05, "loss": 2.13, "step": 5800 }, { "epoch": 0.8260005676979847, "grad_norm": 2.46341609954834, "learning_rate": 1.9106279908703115e-05, "loss": 2.1356, "step": 5820 }, { "epoch": 0.8288390576213455, "grad_norm": 2.382859945297241, "learning_rate": 1.9100113739315783e-05, "loss": 2.1494, "step": 5840 }, { "epoch": 0.8316775475447062, "grad_norm": 2.243987798690796, "learning_rate": 1.9093927373447713e-05, "loss": 2.1147, "step": 5860 }, { "epoch": 0.834516037468067, "grad_norm": 2.3538975715637207, "learning_rate": 1.908772082482871e-05, "loss": 2.0992, "step": 5880 }, { "epoch": 0.8373545273914278, "grad_norm": 2.6704299449920654, "learning_rate": 1.908149410723338e-05, "loss": 2.1547, "step": 5900 }, { "epoch": 0.8401930173147886, "grad_norm": 2.307900905609131, "learning_rate": 1.9075247234481083e-05, "loss": 2.0777, "step": 5920 }, { "epoch": 0.8430315072381493, "grad_norm": 2.278573989868164, "learning_rate": 1.906898022043592e-05, "loss": 2.0842, "step": 5940 }, { "epoch": 0.8458699971615101, "grad_norm": 2.369697093963623, "learning_rate": 1.906269307900668e-05, "loss": 2.1228, "step": 5960 }, { "epoch": 0.8487084870848709, "grad_norm": 2.3282201290130615, "learning_rate": 1.9056385824146834e-05, "loss": 2.102, "step": 5980 }, { "epoch": 0.8515469770082317, "grad_norm": 2.2993173599243164, "learning_rate": 1.905005846985449e-05, "loss": 2.0966, "step": 6000 }, { "epoch": 0.8543854669315923, "grad_norm": 2.2311320304870605, "learning_rate": 1.9043711030172356e-05, "loss": 2.0833, "step": 6020 }, { "epoch": 0.8572239568549531, "grad_norm": 2.5061659812927246, "learning_rate": 1.903734351918772e-05, "loss": 2.0823, "step": 6040 }, { "epoch": 0.8600624467783139, "grad_norm": 2.3820419311523438, "learning_rate": 1.903095595103243e-05, "loss": 2.1062, "step": 6060 }, { "epoch": 0.8629009367016747, "grad_norm": 2.362177848815918, "learning_rate": 1.902454833988282e-05, "loss": 2.1642, "step": 6080 }, { "epoch": 0.8657394266250354, "grad_norm": 2.4554989337921143, "learning_rate": 1.9018120699959738e-05, "loss": 2.0693, "step": 6100 }, { "epoch": 0.8685779165483962, "grad_norm": 2.3069634437561035, "learning_rate": 1.9011673045528454e-05, "loss": 2.1496, "step": 6120 }, { "epoch": 0.871416406471757, "grad_norm": 2.2620880603790283, "learning_rate": 1.900520539089868e-05, "loss": 2.0929, "step": 6140 }, { "epoch": 0.8742548963951178, "grad_norm": 2.4633076190948486, "learning_rate": 1.8998717750424508e-05, "loss": 2.0944, "step": 6160 }, { "epoch": 0.8770933863184786, "grad_norm": 2.3488638401031494, "learning_rate": 1.8992210138504376e-05, "loss": 2.0099, "step": 6180 }, { "epoch": 0.8799318762418393, "grad_norm": 2.4129247665405273, "learning_rate": 1.8985682569581065e-05, "loss": 2.0838, "step": 6200 }, { "epoch": 0.8827703661652001, "grad_norm": 2.3044981956481934, "learning_rate": 1.8979135058141634e-05, "loss": 2.0776, "step": 6220 }, { "epoch": 0.8856088560885609, "grad_norm": 2.3968305587768555, "learning_rate": 1.8972567618717407e-05, "loss": 2.1232, "step": 6240 }, { "epoch": 0.8884473460119217, "grad_norm": 2.3710880279541016, "learning_rate": 1.896598026588393e-05, "loss": 2.0906, "step": 6260 }, { "epoch": 0.8912858359352824, "grad_norm": 2.243549108505249, "learning_rate": 1.8959373014260955e-05, "loss": 2.0823, "step": 6280 }, { "epoch": 0.8941243258586432, "grad_norm": 2.321796178817749, "learning_rate": 1.8952745878512383e-05, "loss": 2.057, "step": 6300 }, { "epoch": 0.896962815782004, "grad_norm": 2.2175071239471436, "learning_rate": 1.8946098873346255e-05, "loss": 2.0836, "step": 6320 }, { "epoch": 0.8998013057053648, "grad_norm": 2.2249510288238525, "learning_rate": 1.8939432013514703e-05, "loss": 2.1118, "step": 6340 }, { "epoch": 0.9026397956287255, "grad_norm": 2.4504356384277344, "learning_rate": 1.8932745313813936e-05, "loss": 2.1, "step": 6360 }, { "epoch": 0.9054782855520863, "grad_norm": 2.5590271949768066, "learning_rate": 1.892603878908418e-05, "loss": 2.0572, "step": 6380 }, { "epoch": 0.9083167754754471, "grad_norm": 2.3541266918182373, "learning_rate": 1.8919312454209668e-05, "loss": 2.0803, "step": 6400 }, { "epoch": 0.9111552653988079, "grad_norm": 2.341197967529297, "learning_rate": 1.8912566324118597e-05, "loss": 2.061, "step": 6420 }, { "epoch": 0.9139937553221686, "grad_norm": 2.3290467262268066, "learning_rate": 1.8905800413783094e-05, "loss": 2.0822, "step": 6440 }, { "epoch": 0.9168322452455294, "grad_norm": 2.369293689727783, "learning_rate": 1.8899014738219193e-05, "loss": 2.1059, "step": 6460 }, { "epoch": 0.9196707351688902, "grad_norm": 2.233910083770752, "learning_rate": 1.889220931248679e-05, "loss": 2.0608, "step": 6480 }, { "epoch": 0.922509225092251, "grad_norm": 2.340981960296631, "learning_rate": 1.888538415168961e-05, "loss": 2.0962, "step": 6500 }, { "epoch": 0.9253477150156117, "grad_norm": 2.313319444656372, "learning_rate": 1.8878539270975187e-05, "loss": 2.0668, "step": 6520 }, { "epoch": 0.9281862049389724, "grad_norm": 2.390779972076416, "learning_rate": 1.887167468553481e-05, "loss": 2.0669, "step": 6540 }, { "epoch": 0.9310246948623332, "grad_norm": 2.2364439964294434, "learning_rate": 1.8864790410603502e-05, "loss": 2.0499, "step": 6560 }, { "epoch": 0.933863184785694, "grad_norm": 2.373263359069824, "learning_rate": 1.885788646145999e-05, "loss": 2.0726, "step": 6580 }, { "epoch": 0.9367016747090547, "grad_norm": 2.4214510917663574, "learning_rate": 1.885096285342667e-05, "loss": 2.0842, "step": 6600 }, { "epoch": 0.9395401646324155, "grad_norm": 2.322636842727661, "learning_rate": 1.884401960186955e-05, "loss": 2.0332, "step": 6620 }, { "epoch": 0.9423786545557763, "grad_norm": 2.3433969020843506, "learning_rate": 1.8837056722198247e-05, "loss": 2.0568, "step": 6640 }, { "epoch": 0.9452171444791371, "grad_norm": 2.275583505630493, "learning_rate": 1.883007422986594e-05, "loss": 2.077, "step": 6660 }, { "epoch": 0.9480556344024978, "grad_norm": 2.2369987964630127, "learning_rate": 1.882307214036933e-05, "loss": 2.0441, "step": 6680 }, { "epoch": 0.9508941243258586, "grad_norm": 2.3637499809265137, "learning_rate": 1.8816050469248624e-05, "loss": 2.0672, "step": 6700 }, { "epoch": 0.9537326142492194, "grad_norm": 2.290034055709839, "learning_rate": 1.8809009232087465e-05, "loss": 2.1043, "step": 6720 }, { "epoch": 0.9565711041725802, "grad_norm": 2.282463312149048, "learning_rate": 1.880194844451294e-05, "loss": 2.0738, "step": 6740 }, { "epoch": 0.959409594095941, "grad_norm": 2.4664204120635986, "learning_rate": 1.8794868122195522e-05, "loss": 2.0543, "step": 6760 }, { "epoch": 0.9622480840193017, "grad_norm": 2.2567498683929443, "learning_rate": 1.8787768280849033e-05, "loss": 2.0605, "step": 6780 }, { "epoch": 0.9650865739426625, "grad_norm": 2.308612108230591, "learning_rate": 1.8780648936230615e-05, "loss": 2.0828, "step": 6800 }, { "epoch": 0.9679250638660233, "grad_norm": 2.340193510055542, "learning_rate": 1.87735101041407e-05, "loss": 2.021, "step": 6820 }, { "epoch": 0.9707635537893841, "grad_norm": 2.4724154472351074, "learning_rate": 1.876635180042297e-05, "loss": 2.0994, "step": 6840 }, { "epoch": 0.9736020437127448, "grad_norm": 2.3213677406311035, "learning_rate": 1.8759174040964313e-05, "loss": 2.0536, "step": 6860 }, { "epoch": 0.9764405336361056, "grad_norm": 2.1988067626953125, "learning_rate": 1.875197684169481e-05, "loss": 2.072, "step": 6880 }, { "epoch": 0.9792790235594664, "grad_norm": 2.327683687210083, "learning_rate": 1.8744760218587675e-05, "loss": 2.0259, "step": 6900 }, { "epoch": 0.9821175134828272, "grad_norm": 2.3538079261779785, "learning_rate": 1.8737524187659228e-05, "loss": 2.065, "step": 6920 }, { "epoch": 0.9849560034061879, "grad_norm": 2.3498942852020264, "learning_rate": 1.873026876496888e-05, "loss": 2.0479, "step": 6940 }, { "epoch": 0.9877944933295487, "grad_norm": 2.319528341293335, "learning_rate": 1.872299396661906e-05, "loss": 2.097, "step": 6960 }, { "epoch": 0.9906329832529095, "grad_norm": 2.2448389530181885, "learning_rate": 1.8715699808755205e-05, "loss": 2.1037, "step": 6980 }, { "epoch": 0.9934714731762703, "grad_norm": 2.3938522338867188, "learning_rate": 1.8708386307565724e-05, "loss": 2.1017, "step": 7000 }, { "epoch": 0.996309963099631, "grad_norm": 2.3075506687164307, "learning_rate": 1.870105347928195e-05, "loss": 2.0782, "step": 7020 }, { "epoch": 0.9991484530229918, "grad_norm": 2.267638683319092, "learning_rate": 1.8693701340178108e-05, "loss": 2.0568, "step": 7040 }, { "epoch": 1.0019869429463526, "grad_norm": 2.5143609046936035, "learning_rate": 1.8686329906571286e-05, "loss": 1.9167, "step": 7060 }, { "epoch": 1.0048254328697133, "grad_norm": 2.537221670150757, "learning_rate": 1.8678939194821395e-05, "loss": 1.9524, "step": 7080 }, { "epoch": 1.0076639227930742, "grad_norm": 2.3169100284576416, "learning_rate": 1.8671529221331117e-05, "loss": 1.9257, "step": 7100 }, { "epoch": 1.0105024127164348, "grad_norm": 2.2808802127838135, "learning_rate": 1.8664100002545903e-05, "loss": 1.9233, "step": 7120 }, { "epoch": 1.0133409026397957, "grad_norm": 2.44062876701355, "learning_rate": 1.8656651554953903e-05, "loss": 1.8698, "step": 7140 }, { "epoch": 1.0161793925631564, "grad_norm": 2.4965016841888428, "learning_rate": 1.864918389508595e-05, "loss": 1.9444, "step": 7160 }, { "epoch": 1.0190178824865173, "grad_norm": 2.4911811351776123, "learning_rate": 1.8641697039515506e-05, "loss": 1.8722, "step": 7180 }, { "epoch": 1.021856372409878, "grad_norm": 2.465142250061035, "learning_rate": 1.8634191004858646e-05, "loss": 1.9252, "step": 7200 }, { "epoch": 1.0246948623332388, "grad_norm": 2.3688101768493652, "learning_rate": 1.8626665807774007e-05, "loss": 1.9183, "step": 7220 }, { "epoch": 1.0275333522565995, "grad_norm": 2.4687576293945312, "learning_rate": 1.861912146496275e-05, "loss": 1.8692, "step": 7240 }, { "epoch": 1.0303718421799604, "grad_norm": 2.4359517097473145, "learning_rate": 1.8611557993168535e-05, "loss": 1.8457, "step": 7260 }, { "epoch": 1.033210332103321, "grad_norm": 2.481811285018921, "learning_rate": 1.860397540917747e-05, "loss": 1.8561, "step": 7280 }, { "epoch": 1.036048822026682, "grad_norm": 2.4005370140075684, "learning_rate": 1.859637372981808e-05, "loss": 1.9092, "step": 7300 }, { "epoch": 1.0388873119500426, "grad_norm": 2.5886898040771484, "learning_rate": 1.858875297196128e-05, "loss": 1.9129, "step": 7320 }, { "epoch": 1.0417258018734032, "grad_norm": 2.3961257934570312, "learning_rate": 1.858111315252031e-05, "loss": 1.8916, "step": 7340 }, { "epoch": 1.0445642917967641, "grad_norm": 2.536820888519287, "learning_rate": 1.857345428845073e-05, "loss": 1.9122, "step": 7360 }, { "epoch": 1.0474027817201248, "grad_norm": 2.53293776512146, "learning_rate": 1.8565776396750353e-05, "loss": 1.9197, "step": 7380 }, { "epoch": 1.0502412716434857, "grad_norm": 2.3372082710266113, "learning_rate": 1.8558079494459237e-05, "loss": 1.861, "step": 7400 }, { "epoch": 1.0530797615668464, "grad_norm": 2.479196310043335, "learning_rate": 1.8550363598659617e-05, "loss": 1.8691, "step": 7420 }, { "epoch": 1.0559182514902072, "grad_norm": 2.287871837615967, "learning_rate": 1.8542628726475894e-05, "loss": 1.9174, "step": 7440 }, { "epoch": 1.058756741413568, "grad_norm": 2.4585983753204346, "learning_rate": 1.8534874895074572e-05, "loss": 1.9104, "step": 7460 }, { "epoch": 1.0615952313369288, "grad_norm": 2.4119174480438232, "learning_rate": 1.852710212166424e-05, "loss": 1.8488, "step": 7480 }, { "epoch": 1.0644337212602895, "grad_norm": 2.519225597381592, "learning_rate": 1.8519310423495533e-05, "loss": 1.896, "step": 7500 }, { "epoch": 1.0672722111836503, "grad_norm": 2.413775682449341, "learning_rate": 1.851149981786107e-05, "loss": 1.9284, "step": 7520 }, { "epoch": 1.070110701107011, "grad_norm": 2.424511432647705, "learning_rate": 1.8503670322095444e-05, "loss": 1.8937, "step": 7540 }, { "epoch": 1.072949191030372, "grad_norm": 2.4528841972351074, "learning_rate": 1.849582195357517e-05, "loss": 1.872, "step": 7560 }, { "epoch": 1.0757876809537326, "grad_norm": 2.2939915657043457, "learning_rate": 1.8487954729718648e-05, "loss": 1.8673, "step": 7580 }, { "epoch": 1.0786261708770934, "grad_norm": 2.695927143096924, "learning_rate": 1.848006866798613e-05, "loss": 1.8559, "step": 7600 }, { "epoch": 1.0814646608004541, "grad_norm": 2.440675735473633, "learning_rate": 1.8472163785879663e-05, "loss": 1.8835, "step": 7620 }, { "epoch": 1.084303150723815, "grad_norm": 2.141202211380005, "learning_rate": 1.8464240100943076e-05, "loss": 1.8608, "step": 7640 }, { "epoch": 1.0871416406471757, "grad_norm": 2.4342498779296875, "learning_rate": 1.8456297630761922e-05, "loss": 1.8441, "step": 7660 }, { "epoch": 1.0899801305705366, "grad_norm": 2.3985726833343506, "learning_rate": 1.8448336392963452e-05, "loss": 1.9171, "step": 7680 }, { "epoch": 1.0928186204938972, "grad_norm": 2.446610450744629, "learning_rate": 1.844035640521656e-05, "loss": 1.922, "step": 7700 }, { "epoch": 1.095657110417258, "grad_norm": 2.3196372985839844, "learning_rate": 1.8432357685231764e-05, "loss": 1.9538, "step": 7720 }, { "epoch": 1.0984956003406188, "grad_norm": 2.441157817840576, "learning_rate": 1.8424340250761138e-05, "loss": 1.8832, "step": 7740 }, { "epoch": 1.1013340902639797, "grad_norm": 2.550488233566284, "learning_rate": 1.8416304119598307e-05, "loss": 1.9192, "step": 7760 }, { "epoch": 1.1041725801873403, "grad_norm": 2.4842491149902344, "learning_rate": 1.840824930957839e-05, "loss": 1.9316, "step": 7780 }, { "epoch": 1.1070110701107012, "grad_norm": 2.2662861347198486, "learning_rate": 1.8400175838577948e-05, "loss": 1.8928, "step": 7800 }, { "epoch": 1.1098495600340619, "grad_norm": 2.512302875518799, "learning_rate": 1.8392083724514974e-05, "loss": 1.9141, "step": 7820 }, { "epoch": 1.1126880499574225, "grad_norm": 2.4415171146392822, "learning_rate": 1.8383972985348822e-05, "loss": 1.9476, "step": 7840 }, { "epoch": 1.1155265398807834, "grad_norm": 2.4001150131225586, "learning_rate": 1.8375843639080194e-05, "loss": 1.8445, "step": 7860 }, { "epoch": 1.118365029804144, "grad_norm": 2.324253797531128, "learning_rate": 1.836769570375108e-05, "loss": 1.8567, "step": 7880 }, { "epoch": 1.121203519727505, "grad_norm": 2.5065720081329346, "learning_rate": 1.8359529197444724e-05, "loss": 1.8726, "step": 7900 }, { "epoch": 1.1240420096508656, "grad_norm": 2.5705080032348633, "learning_rate": 1.83513441382856e-05, "loss": 1.9179, "step": 7920 }, { "epoch": 1.1268804995742265, "grad_norm": 2.394152879714966, "learning_rate": 1.8343140544439344e-05, "loss": 1.9343, "step": 7940 }, { "epoch": 1.1297189894975872, "grad_norm": 2.396576166152954, "learning_rate": 1.833491843411273e-05, "loss": 1.9304, "step": 7960 }, { "epoch": 1.132557479420948, "grad_norm": 2.326009511947632, "learning_rate": 1.8326677825553627e-05, "loss": 1.8738, "step": 7980 }, { "epoch": 1.1353959693443088, "grad_norm": 2.336808681488037, "learning_rate": 1.831841873705096e-05, "loss": 1.896, "step": 8000 }, { "epoch": 1.1382344592676696, "grad_norm": 2.6412932872772217, "learning_rate": 1.8310141186934667e-05, "loss": 1.9107, "step": 8020 }, { "epoch": 1.1410729491910303, "grad_norm": 2.399062395095825, "learning_rate": 1.8301845193575666e-05, "loss": 1.9066, "step": 8040 }, { "epoch": 1.1439114391143912, "grad_norm": 2.488966703414917, "learning_rate": 1.829353077538579e-05, "loss": 1.9008, "step": 8060 }, { "epoch": 1.1467499290377519, "grad_norm": 2.44512677192688, "learning_rate": 1.8285197950817778e-05, "loss": 1.919, "step": 8080 }, { "epoch": 1.1495884189611127, "grad_norm": 2.3461754322052, "learning_rate": 1.827684673836522e-05, "loss": 1.9732, "step": 8100 }, { "epoch": 1.1524269088844734, "grad_norm": 2.39430570602417, "learning_rate": 1.8268477156562504e-05, "loss": 1.8872, "step": 8120 }, { "epoch": 1.1552653988078343, "grad_norm": 2.3630318641662598, "learning_rate": 1.82600892239848e-05, "loss": 1.9321, "step": 8140 }, { "epoch": 1.158103888731195, "grad_norm": 2.58503794670105, "learning_rate": 1.8251682959247988e-05, "loss": 1.9012, "step": 8160 }, { "epoch": 1.1609423786545559, "grad_norm": 2.3615262508392334, "learning_rate": 1.8243258381008654e-05, "loss": 1.9111, "step": 8180 }, { "epoch": 1.1637808685779165, "grad_norm": 2.561760663986206, "learning_rate": 1.8234815507964013e-05, "loss": 1.9339, "step": 8200 }, { "epoch": 1.1666193585012774, "grad_norm": 2.5927693843841553, "learning_rate": 1.8226354358851894e-05, "loss": 1.9765, "step": 8220 }, { "epoch": 1.169457848424638, "grad_norm": 2.5300285816192627, "learning_rate": 1.8217874952450673e-05, "loss": 1.917, "step": 8240 }, { "epoch": 1.172296338347999, "grad_norm": 2.3264412879943848, "learning_rate": 1.8209377307579262e-05, "loss": 1.8684, "step": 8260 }, { "epoch": 1.1751348282713596, "grad_norm": 2.5316264629364014, "learning_rate": 1.8200861443097037e-05, "loss": 1.8766, "step": 8280 }, { "epoch": 1.1779733181947205, "grad_norm": 2.5143356323242188, "learning_rate": 1.8192327377903823e-05, "loss": 1.9048, "step": 8300 }, { "epoch": 1.1808118081180812, "grad_norm": 2.371203660964966, "learning_rate": 1.8183775130939823e-05, "loss": 1.9057, "step": 8320 }, { "epoch": 1.1836502980414418, "grad_norm": 2.3475966453552246, "learning_rate": 1.817520472118561e-05, "loss": 1.8555, "step": 8340 }, { "epoch": 1.1864887879648027, "grad_norm": 2.472414255142212, "learning_rate": 1.816661616766205e-05, "loss": 1.931, "step": 8360 }, { "epoch": 1.1893272778881636, "grad_norm": 2.4851348400115967, "learning_rate": 1.815800948943029e-05, "loss": 1.8176, "step": 8380 }, { "epoch": 1.1921657678115243, "grad_norm": 2.4040234088897705, "learning_rate": 1.814938470559169e-05, "loss": 1.8733, "step": 8400 }, { "epoch": 1.195004257734885, "grad_norm": 2.594592571258545, "learning_rate": 1.814074183528781e-05, "loss": 1.8549, "step": 8420 }, { "epoch": 1.1978427476582458, "grad_norm": 2.3844246864318848, "learning_rate": 1.8132080897700338e-05, "loss": 1.9415, "step": 8440 }, { "epoch": 1.2006812375816065, "grad_norm": 2.433722734451294, "learning_rate": 1.812340191205105e-05, "loss": 1.9244, "step": 8460 }, { "epoch": 1.2035197275049674, "grad_norm": 2.44905686378479, "learning_rate": 1.8114704897601806e-05, "loss": 1.9013, "step": 8480 }, { "epoch": 1.206358217428328, "grad_norm": 2.3119888305664062, "learning_rate": 1.8105989873654453e-05, "loss": 1.8768, "step": 8500 }, { "epoch": 1.209196707351689, "grad_norm": 2.3774309158325195, "learning_rate": 1.8097256859550817e-05, "loss": 1.8692, "step": 8520 }, { "epoch": 1.2120351972750496, "grad_norm": 2.365177631378174, "learning_rate": 1.8088505874672654e-05, "loss": 1.898, "step": 8540 }, { "epoch": 1.2148736871984105, "grad_norm": 2.522331476211548, "learning_rate": 1.8079736938441596e-05, "loss": 1.9278, "step": 8560 }, { "epoch": 1.2177121771217712, "grad_norm": 2.355311393737793, "learning_rate": 1.807095007031912e-05, "loss": 1.8687, "step": 8580 }, { "epoch": 1.220550667045132, "grad_norm": 2.4324936866760254, "learning_rate": 1.8062145289806503e-05, "loss": 1.9011, "step": 8600 }, { "epoch": 1.2233891569684927, "grad_norm": 2.4804439544677734, "learning_rate": 1.8053322616444774e-05, "loss": 1.9496, "step": 8620 }, { "epoch": 1.2262276468918536, "grad_norm": 2.411646842956543, "learning_rate": 1.8044482069814672e-05, "loss": 1.8988, "step": 8640 }, { "epoch": 1.2290661368152143, "grad_norm": 2.494028091430664, "learning_rate": 1.8035623669536594e-05, "loss": 1.9506, "step": 8660 }, { "epoch": 1.2319046267385751, "grad_norm": 2.3650920391082764, "learning_rate": 1.8026747435270584e-05, "loss": 1.8913, "step": 8680 }, { "epoch": 1.2347431166619358, "grad_norm": 2.535071849822998, "learning_rate": 1.8017853386716243e-05, "loss": 1.922, "step": 8700 }, { "epoch": 1.2375816065852967, "grad_norm": 2.337268590927124, "learning_rate": 1.800894154361272e-05, "loss": 1.8679, "step": 8720 }, { "epoch": 1.2404200965086574, "grad_norm": 2.3973734378814697, "learning_rate": 1.800001192573866e-05, "loss": 1.9098, "step": 8740 }, { "epoch": 1.2432585864320183, "grad_norm": 2.4479072093963623, "learning_rate": 1.799106455291214e-05, "loss": 1.908, "step": 8760 }, { "epoch": 1.246097076355379, "grad_norm": 2.387651205062866, "learning_rate": 1.7982099444990662e-05, "loss": 1.8618, "step": 8780 }, { "epoch": 1.2489355662787398, "grad_norm": 2.339479684829712, "learning_rate": 1.7973116621871074e-05, "loss": 1.8937, "step": 8800 }, { "epoch": 1.2517740562021005, "grad_norm": 2.5908615589141846, "learning_rate": 1.796411610348955e-05, "loss": 1.8545, "step": 8820 }, { "epoch": 1.2546125461254611, "grad_norm": 2.539785623550415, "learning_rate": 1.795509790982153e-05, "loss": 1.9183, "step": 8840 }, { "epoch": 1.257451036048822, "grad_norm": 2.312326669692993, "learning_rate": 1.7946062060881683e-05, "loss": 1.9357, "step": 8860 }, { "epoch": 1.260289525972183, "grad_norm": 2.396221876144409, "learning_rate": 1.793700857672387e-05, "loss": 1.8623, "step": 8880 }, { "epoch": 1.2631280158955436, "grad_norm": 2.376504421234131, "learning_rate": 1.7927937477441074e-05, "loss": 1.9312, "step": 8900 }, { "epoch": 1.2659665058189042, "grad_norm": 2.4481639862060547, "learning_rate": 1.7918848783165382e-05, "loss": 1.8966, "step": 8920 }, { "epoch": 1.2688049957422651, "grad_norm": 2.4880106449127197, "learning_rate": 1.7909742514067942e-05, "loss": 1.9611, "step": 8940 }, { "epoch": 1.271643485665626, "grad_norm": 2.568020820617676, "learning_rate": 1.7900618690358884e-05, "loss": 1.8944, "step": 8960 }, { "epoch": 1.2744819755889867, "grad_norm": 2.4528987407684326, "learning_rate": 1.7891477332287315e-05, "loss": 1.8922, "step": 8980 }, { "epoch": 1.2773204655123473, "grad_norm": 2.4651987552642822, "learning_rate": 1.788231846014125e-05, "loss": 1.9234, "step": 9000 }, { "epoch": 1.2801589554357082, "grad_norm": 2.3549816608428955, "learning_rate": 1.787314209424758e-05, "loss": 1.8561, "step": 9020 }, { "epoch": 1.282997445359069, "grad_norm": 2.458906412124634, "learning_rate": 1.7863948254972013e-05, "loss": 1.9079, "step": 9040 }, { "epoch": 1.2858359352824298, "grad_norm": 2.453667640686035, "learning_rate": 1.7854736962719042e-05, "loss": 1.8826, "step": 9060 }, { "epoch": 1.2886744252057905, "grad_norm": 2.589059591293335, "learning_rate": 1.7845508237931896e-05, "loss": 1.8845, "step": 9080 }, { "epoch": 1.2915129151291513, "grad_norm": 2.2940869331359863, "learning_rate": 1.7836262101092488e-05, "loss": 1.9506, "step": 9100 }, { "epoch": 1.294351405052512, "grad_norm": 2.3912315368652344, "learning_rate": 1.7826998572721377e-05, "loss": 1.9061, "step": 9120 }, { "epoch": 1.297189894975873, "grad_norm": 2.3489718437194824, "learning_rate": 1.7817717673377722e-05, "loss": 1.88, "step": 9140 }, { "epoch": 1.3000283848992336, "grad_norm": 2.518566846847534, "learning_rate": 1.7808419423659235e-05, "loss": 1.8693, "step": 9160 }, { "epoch": 1.3028668748225944, "grad_norm": 2.3476552963256836, "learning_rate": 1.7799103844202128e-05, "loss": 1.9034, "step": 9180 }, { "epoch": 1.305705364745955, "grad_norm": 2.2228455543518066, "learning_rate": 1.7789770955681082e-05, "loss": 1.937, "step": 9200 }, { "epoch": 1.308543854669316, "grad_norm": 2.4339869022369385, "learning_rate": 1.778042077880919e-05, "loss": 1.9047, "step": 9220 }, { "epoch": 1.3113823445926767, "grad_norm": 2.370659112930298, "learning_rate": 1.7771053334337916e-05, "loss": 1.9222, "step": 9240 }, { "epoch": 1.3142208345160376, "grad_norm": 2.3733479976654053, "learning_rate": 1.7761668643057045e-05, "loss": 1.9125, "step": 9260 }, { "epoch": 1.3170593244393982, "grad_norm": 2.5406057834625244, "learning_rate": 1.7752266725794634e-05, "loss": 1.8782, "step": 9280 }, { "epoch": 1.319897814362759, "grad_norm": 2.3932764530181885, "learning_rate": 1.7742847603416982e-05, "loss": 1.891, "step": 9300 }, { "epoch": 1.3227363042861198, "grad_norm": 2.469506025314331, "learning_rate": 1.7733411296828563e-05, "loss": 1.8851, "step": 9320 }, { "epoch": 1.3255747942094804, "grad_norm": 2.4248979091644287, "learning_rate": 1.7723957826971996e-05, "loss": 1.8924, "step": 9340 }, { "epoch": 1.3284132841328413, "grad_norm": 2.4909257888793945, "learning_rate": 1.7714487214827988e-05, "loss": 1.9729, "step": 9360 }, { "epoch": 1.3312517740562022, "grad_norm": 2.401510238647461, "learning_rate": 1.7704999481415287e-05, "loss": 1.9161, "step": 9380 }, { "epoch": 1.3340902639795629, "grad_norm": 2.424142360687256, "learning_rate": 1.7695494647790646e-05, "loss": 1.8937, "step": 9400 }, { "epoch": 1.3369287539029235, "grad_norm": 2.611459732055664, "learning_rate": 1.7685972735048768e-05, "loss": 1.873, "step": 9420 }, { "epoch": 1.3397672438262844, "grad_norm": 2.390551805496216, "learning_rate": 1.767643376432226e-05, "loss": 1.9358, "step": 9440 }, { "epoch": 1.3426057337496453, "grad_norm": 2.3363115787506104, "learning_rate": 1.766687775678159e-05, "loss": 1.8879, "step": 9460 }, { "epoch": 1.345444223673006, "grad_norm": 2.462367057800293, "learning_rate": 1.7657304733635024e-05, "loss": 1.9296, "step": 9480 }, { "epoch": 1.3482827135963666, "grad_norm": 2.2522923946380615, "learning_rate": 1.764771471612861e-05, "loss": 1.8717, "step": 9500 }, { "epoch": 1.3511212035197275, "grad_norm": 2.339628219604492, "learning_rate": 1.7638107725546104e-05, "loss": 1.9276, "step": 9520 }, { "epoch": 1.3539596934430882, "grad_norm": 2.485044479370117, "learning_rate": 1.762848378320893e-05, "loss": 1.9061, "step": 9540 }, { "epoch": 1.356798183366449, "grad_norm": 2.3764455318450928, "learning_rate": 1.7618842910476135e-05, "loss": 1.8921, "step": 9560 }, { "epoch": 1.3596366732898097, "grad_norm": 2.4634227752685547, "learning_rate": 1.7609185128744342e-05, "loss": 1.8331, "step": 9580 }, { "epoch": 1.3624751632131706, "grad_norm": 2.3391668796539307, "learning_rate": 1.7599510459447703e-05, "loss": 1.8678, "step": 9600 }, { "epoch": 1.3653136531365313, "grad_norm": 2.519028902053833, "learning_rate": 1.7589818924057846e-05, "loss": 1.8713, "step": 9620 }, { "epoch": 1.3681521430598922, "grad_norm": 2.591221570968628, "learning_rate": 1.7580110544083837e-05, "loss": 1.8955, "step": 9640 }, { "epoch": 1.3709906329832529, "grad_norm": 2.4363555908203125, "learning_rate": 1.757038534107211e-05, "loss": 1.8748, "step": 9660 }, { "epoch": 1.3738291229066137, "grad_norm": 2.38515305519104, "learning_rate": 1.7560643336606462e-05, "loss": 1.912, "step": 9680 }, { "epoch": 1.3766676128299744, "grad_norm": 2.5047712326049805, "learning_rate": 1.7550884552307956e-05, "loss": 1.9127, "step": 9700 }, { "epoch": 1.3795061027533353, "grad_norm": 2.442993402481079, "learning_rate": 1.7541109009834905e-05, "loss": 1.8694, "step": 9720 }, { "epoch": 1.382344592676696, "grad_norm": 2.2701942920684814, "learning_rate": 1.7531316730882812e-05, "loss": 1.8699, "step": 9740 }, { "epoch": 1.3851830826000568, "grad_norm": 2.31447172164917, "learning_rate": 1.7521507737184328e-05, "loss": 1.8963, "step": 9760 }, { "epoch": 1.3880215725234175, "grad_norm": 2.442223310470581, "learning_rate": 1.75116820505092e-05, "loss": 1.8888, "step": 9780 }, { "epoch": 1.3908600624467784, "grad_norm": 2.2749106884002686, "learning_rate": 1.7501839692664214e-05, "loss": 1.8814, "step": 9800 }, { "epoch": 1.393698552370139, "grad_norm": 2.2991209030151367, "learning_rate": 1.7491980685493163e-05, "loss": 1.8934, "step": 9820 }, { "epoch": 1.3965370422934997, "grad_norm": 2.330522298812866, "learning_rate": 1.7482105050876794e-05, "loss": 1.8762, "step": 9840 }, { "epoch": 1.3993755322168606, "grad_norm": 2.350338935852051, "learning_rate": 1.747221281073275e-05, "loss": 1.8954, "step": 9860 }, { "epoch": 1.4022140221402215, "grad_norm": 2.2585582733154297, "learning_rate": 1.746230398701553e-05, "loss": 1.8821, "step": 9880 }, { "epoch": 1.4050525120635822, "grad_norm": 2.45865535736084, "learning_rate": 1.7452378601716433e-05, "loss": 1.9317, "step": 9900 }, { "epoch": 1.4078910019869428, "grad_norm": 2.52109694480896, "learning_rate": 1.744243667686353e-05, "loss": 1.8839, "step": 9920 }, { "epoch": 1.4107294919103037, "grad_norm": 2.3874857425689697, "learning_rate": 1.743247823452158e-05, "loss": 1.9028, "step": 9940 }, { "epoch": 1.4135679818336646, "grad_norm": 2.514723300933838, "learning_rate": 1.7422503296792013e-05, "loss": 1.8579, "step": 9960 }, { "epoch": 1.4164064717570253, "grad_norm": 2.385085105895996, "learning_rate": 1.741251188581286e-05, "loss": 1.9079, "step": 9980 }, { "epoch": 1.419244961680386, "grad_norm": 2.432723045349121, "learning_rate": 1.740250402375872e-05, "loss": 1.8599, "step": 10000 }, { "epoch": 1.4220834516037468, "grad_norm": 2.464299440383911, "learning_rate": 1.7392479732840698e-05, "loss": 1.818, "step": 10020 }, { "epoch": 1.4249219415271077, "grad_norm": 2.389589548110962, "learning_rate": 1.7382439035306364e-05, "loss": 1.89, "step": 10040 }, { "epoch": 1.4277604314504684, "grad_norm": 2.385801315307617, "learning_rate": 1.737238195343969e-05, "loss": 1.9035, "step": 10060 }, { "epoch": 1.430598921373829, "grad_norm": 2.426624298095703, "learning_rate": 1.736230850956103e-05, "loss": 1.8768, "step": 10080 }, { "epoch": 1.43343741129719, "grad_norm": 2.513197660446167, "learning_rate": 1.7352218726027036e-05, "loss": 1.9404, "step": 10100 }, { "epoch": 1.4362759012205506, "grad_norm": 2.4197587966918945, "learning_rate": 1.734211262523063e-05, "loss": 1.8883, "step": 10120 }, { "epoch": 1.4391143911439115, "grad_norm": 2.4089245796203613, "learning_rate": 1.733199022960094e-05, "loss": 1.9444, "step": 10140 }, { "epoch": 1.4419528810672722, "grad_norm": 2.416811943054199, "learning_rate": 1.7321851561603265e-05, "loss": 1.8668, "step": 10160 }, { "epoch": 1.444791370990633, "grad_norm": 2.454352378845215, "learning_rate": 1.731169664373902e-05, "loss": 1.9298, "step": 10180 }, { "epoch": 1.4476298609139937, "grad_norm": 2.267517328262329, "learning_rate": 1.7301525498545685e-05, "loss": 1.8619, "step": 10200 }, { "epoch": 1.4504683508373546, "grad_norm": 2.430478811264038, "learning_rate": 1.729133814859675e-05, "loss": 1.9103, "step": 10220 }, { "epoch": 1.4533068407607153, "grad_norm": 2.2854342460632324, "learning_rate": 1.7281134616501668e-05, "loss": 1.9077, "step": 10240 }, { "epoch": 1.4561453306840761, "grad_norm": 2.388291597366333, "learning_rate": 1.7270914924905805e-05, "loss": 1.9075, "step": 10260 }, { "epoch": 1.4589838206074368, "grad_norm": 2.5017805099487305, "learning_rate": 1.7260679096490403e-05, "loss": 1.8819, "step": 10280 }, { "epoch": 1.4618223105307977, "grad_norm": 2.3712542057037354, "learning_rate": 1.72504271539725e-05, "loss": 1.873, "step": 10300 }, { "epoch": 1.4646608004541584, "grad_norm": 2.3165364265441895, "learning_rate": 1.7240159120104917e-05, "loss": 1.9018, "step": 10320 }, { "epoch": 1.467499290377519, "grad_norm": 2.4430882930755615, "learning_rate": 1.722987501767617e-05, "loss": 1.9228, "step": 10340 }, { "epoch": 1.47033778030088, "grad_norm": 2.465818405151367, "learning_rate": 1.721957486951044e-05, "loss": 1.8946, "step": 10360 }, { "epoch": 1.4731762702242408, "grad_norm": 2.535247802734375, "learning_rate": 1.720925869846753e-05, "loss": 1.8976, "step": 10380 }, { "epoch": 1.4760147601476015, "grad_norm": 2.323089599609375, "learning_rate": 1.7198926527442795e-05, "loss": 1.8806, "step": 10400 }, { "epoch": 1.4788532500709621, "grad_norm": 2.33777117729187, "learning_rate": 1.7188578379367097e-05, "loss": 1.8709, "step": 10420 }, { "epoch": 1.481691739994323, "grad_norm": 2.329279899597168, "learning_rate": 1.7178214277206768e-05, "loss": 1.9125, "step": 10440 }, { "epoch": 1.484530229917684, "grad_norm": 2.627242088317871, "learning_rate": 1.7167834243963536e-05, "loss": 1.879, "step": 10460 }, { "epoch": 1.4873687198410446, "grad_norm": 2.1803481578826904, "learning_rate": 1.7157438302674495e-05, "loss": 1.8418, "step": 10480 }, { "epoch": 1.4902072097644052, "grad_norm": 2.3069684505462646, "learning_rate": 1.714702647641204e-05, "loss": 1.8778, "step": 10500 }, { "epoch": 1.4930456996877661, "grad_norm": 2.377667188644409, "learning_rate": 1.7136598788283822e-05, "loss": 1.8782, "step": 10520 }, { "epoch": 1.495884189611127, "grad_norm": 2.438920021057129, "learning_rate": 1.712615526143269e-05, "loss": 1.8815, "step": 10540 }, { "epoch": 1.4987226795344877, "grad_norm": 2.288480520248413, "learning_rate": 1.7115695919036656e-05, "loss": 1.8642, "step": 10560 }, { "epoch": 1.5015611694578483, "grad_norm": 2.475261926651001, "learning_rate": 1.7105220784308824e-05, "loss": 1.9052, "step": 10580 }, { "epoch": 1.5043996593812092, "grad_norm": 2.3163247108459473, "learning_rate": 1.709472988049735e-05, "loss": 1.8888, "step": 10600 }, { "epoch": 1.5072381493045701, "grad_norm": 2.2453675270080566, "learning_rate": 1.7084223230885384e-05, "loss": 1.9016, "step": 10620 }, { "epoch": 1.5100766392279308, "grad_norm": 2.3576395511627197, "learning_rate": 1.7073700858791027e-05, "loss": 1.9045, "step": 10640 }, { "epoch": 1.5129151291512914, "grad_norm": 2.353783130645752, "learning_rate": 1.706316278756727e-05, "loss": 1.9231, "step": 10660 }, { "epoch": 1.5157536190746523, "grad_norm": 2.3332369327545166, "learning_rate": 1.705260904060195e-05, "loss": 1.9004, "step": 10680 }, { "epoch": 1.518592108998013, "grad_norm": 2.341947078704834, "learning_rate": 1.7042039641317685e-05, "loss": 1.883, "step": 10700 }, { "epoch": 1.521430598921374, "grad_norm": 2.3840863704681396, "learning_rate": 1.7031454613171845e-05, "loss": 1.8744, "step": 10720 }, { "epoch": 1.5242690888447346, "grad_norm": 2.3357958793640137, "learning_rate": 1.7020853979656477e-05, "loss": 1.8453, "step": 10740 }, { "epoch": 1.5271075787680952, "grad_norm": 2.447324275970459, "learning_rate": 1.7010237764298255e-05, "loss": 1.8432, "step": 10760 }, { "epoch": 1.529946068691456, "grad_norm": 2.3025155067443848, "learning_rate": 1.699960599065846e-05, "loss": 1.8483, "step": 10780 }, { "epoch": 1.532784558614817, "grad_norm": 2.229663610458374, "learning_rate": 1.6988958682332874e-05, "loss": 1.875, "step": 10800 }, { "epoch": 1.5356230485381777, "grad_norm": 2.2518868446350098, "learning_rate": 1.6978295862951772e-05, "loss": 1.881, "step": 10820 }, { "epoch": 1.5384615384615383, "grad_norm": 2.4646663665771484, "learning_rate": 1.696761755617985e-05, "loss": 1.9554, "step": 10840 }, { "epoch": 1.5413000283848992, "grad_norm": 2.3364551067352295, "learning_rate": 1.6956923785716178e-05, "loss": 1.9033, "step": 10860 }, { "epoch": 1.54413851830826, "grad_norm": 2.4326839447021484, "learning_rate": 1.6946214575294148e-05, "loss": 1.8963, "step": 10880 }, { "epoch": 1.5469770082316208, "grad_norm": 2.317146062850952, "learning_rate": 1.6935489948681398e-05, "loss": 1.8939, "step": 10900 }, { "epoch": 1.5498154981549814, "grad_norm": 2.481961488723755, "learning_rate": 1.6924749929679817e-05, "loss": 1.8985, "step": 10920 }, { "epoch": 1.5526539880783423, "grad_norm": 2.4172651767730713, "learning_rate": 1.691399454212542e-05, "loss": 1.8872, "step": 10940 }, { "epoch": 1.5554924780017032, "grad_norm": 2.42179799079895, "learning_rate": 1.6903223809888358e-05, "loss": 1.9202, "step": 10960 }, { "epoch": 1.5583309679250639, "grad_norm": 2.2576653957366943, "learning_rate": 1.6892437756872818e-05, "loss": 1.8417, "step": 10980 }, { "epoch": 1.5611694578484245, "grad_norm": 2.2595551013946533, "learning_rate": 1.6881636407016996e-05, "loss": 1.8814, "step": 11000 }, { "epoch": 1.5640079477717854, "grad_norm": 2.2952046394348145, "learning_rate": 1.687081978429304e-05, "loss": 1.9187, "step": 11020 }, { "epoch": 1.5668464376951463, "grad_norm": 2.3288421630859375, "learning_rate": 1.6859987912706995e-05, "loss": 1.9255, "step": 11040 }, { "epoch": 1.569684927618507, "grad_norm": 2.321136474609375, "learning_rate": 1.684914081629874e-05, "loss": 1.9298, "step": 11060 }, { "epoch": 1.5725234175418676, "grad_norm": 2.510637044906616, "learning_rate": 1.6838278519141954e-05, "loss": 1.896, "step": 11080 }, { "epoch": 1.5753619074652285, "grad_norm": 2.472397804260254, "learning_rate": 1.6827401045344046e-05, "loss": 1.9004, "step": 11100 }, { "epoch": 1.5782003973885894, "grad_norm": 2.413074254989624, "learning_rate": 1.6816508419046104e-05, "loss": 1.9367, "step": 11120 }, { "epoch": 1.58103888731195, "grad_norm": 2.338806390762329, "learning_rate": 1.6805600664422856e-05, "loss": 1.8818, "step": 11140 }, { "epoch": 1.5838773772353107, "grad_norm": 2.3642308712005615, "learning_rate": 1.67946778056826e-05, "loss": 1.9199, "step": 11160 }, { "epoch": 1.5867158671586716, "grad_norm": 2.1568758487701416, "learning_rate": 1.6783739867067147e-05, "loss": 1.8853, "step": 11180 }, { "epoch": 1.5895543570820325, "grad_norm": 2.4399914741516113, "learning_rate": 1.677278687285179e-05, "loss": 1.9235, "step": 11200 }, { "epoch": 1.5923928470053932, "grad_norm": 2.3465065956115723, "learning_rate": 1.6761818847345223e-05, "loss": 1.8591, "step": 11220 }, { "epoch": 1.5952313369287539, "grad_norm": 2.4309961795806885, "learning_rate": 1.6750835814889515e-05, "loss": 1.9292, "step": 11240 }, { "epoch": 1.5980698268521145, "grad_norm": 2.4436988830566406, "learning_rate": 1.6739837799860022e-05, "loss": 1.8875, "step": 11260 }, { "epoch": 1.6009083167754754, "grad_norm": 2.309965133666992, "learning_rate": 1.672882482666537e-05, "loss": 1.8907, "step": 11280 }, { "epoch": 1.6037468066988363, "grad_norm": 2.38466215133667, "learning_rate": 1.671779691974737e-05, "loss": 1.8834, "step": 11300 }, { "epoch": 1.606585296622197, "grad_norm": 2.3375051021575928, "learning_rate": 1.6706754103580987e-05, "loss": 1.8347, "step": 11320 }, { "epoch": 1.6094237865455576, "grad_norm": 2.3268837928771973, "learning_rate": 1.6695696402674258e-05, "loss": 1.8949, "step": 11340 }, { "epoch": 1.6122622764689185, "grad_norm": 2.534078359603882, "learning_rate": 1.6684623841568276e-05, "loss": 1.9326, "step": 11360 }, { "epoch": 1.6151007663922794, "grad_norm": 2.2717092037200928, "learning_rate": 1.66735364448371e-05, "loss": 1.8942, "step": 11380 }, { "epoch": 1.61793925631564, "grad_norm": 2.1735470294952393, "learning_rate": 1.6662434237087718e-05, "loss": 1.8889, "step": 11400 }, { "epoch": 1.6207777462390007, "grad_norm": 2.3566861152648926, "learning_rate": 1.6651317242959987e-05, "loss": 1.8851, "step": 11420 }, { "epoch": 1.6236162361623616, "grad_norm": 2.3768975734710693, "learning_rate": 1.6640185487126587e-05, "loss": 1.8652, "step": 11440 }, { "epoch": 1.6264547260857225, "grad_norm": 2.2751026153564453, "learning_rate": 1.6629038994292956e-05, "loss": 1.9424, "step": 11460 }, { "epoch": 1.6292932160090832, "grad_norm": 2.325651168823242, "learning_rate": 1.6617877789197235e-05, "loss": 1.9077, "step": 11480 }, { "epoch": 1.6321317059324438, "grad_norm": 2.363680362701416, "learning_rate": 1.6606701896610224e-05, "loss": 1.8364, "step": 11500 }, { "epoch": 1.6349701958558047, "grad_norm": 2.431579828262329, "learning_rate": 1.6595511341335315e-05, "loss": 1.9281, "step": 11520 }, { "epoch": 1.6378086857791656, "grad_norm": 2.38569974899292, "learning_rate": 1.658430614820844e-05, "loss": 1.8872, "step": 11540 }, { "epoch": 1.6406471757025263, "grad_norm": 2.4201438426971436, "learning_rate": 1.6573086342098028e-05, "loss": 1.9172, "step": 11560 }, { "epoch": 1.643485665625887, "grad_norm": 2.2831990718841553, "learning_rate": 1.6561851947904923e-05, "loss": 1.8921, "step": 11580 }, { "epoch": 1.6463241555492478, "grad_norm": 2.3878748416900635, "learning_rate": 1.655060299056236e-05, "loss": 1.8637, "step": 11600 }, { "epoch": 1.6491626454726087, "grad_norm": 2.408562183380127, "learning_rate": 1.653933949503589e-05, "loss": 1.961, "step": 11620 }, { "epoch": 1.6520011353959694, "grad_norm": 2.378300666809082, "learning_rate": 1.6528061486323328e-05, "loss": 1.8576, "step": 11640 }, { "epoch": 1.65483962531933, "grad_norm": 2.2782750129699707, "learning_rate": 1.65167689894547e-05, "loss": 1.8688, "step": 11660 }, { "epoch": 1.657678115242691, "grad_norm": 2.3620054721832275, "learning_rate": 1.650546202949219e-05, "loss": 1.8344, "step": 11680 }, { "epoch": 1.6605166051660518, "grad_norm": 2.2618308067321777, "learning_rate": 1.649414063153007e-05, "loss": 1.8942, "step": 11700 }, { "epoch": 1.6633550950894125, "grad_norm": 2.301419258117676, "learning_rate": 1.6482804820694673e-05, "loss": 1.8779, "step": 11720 }, { "epoch": 1.6661935850127731, "grad_norm": 2.407195568084717, "learning_rate": 1.64714546221443e-05, "loss": 1.9279, "step": 11740 }, { "epoch": 1.6690320749361338, "grad_norm": 2.41135835647583, "learning_rate": 1.64600900610692e-05, "loss": 1.8985, "step": 11760 }, { "epoch": 1.6718705648594947, "grad_norm": 2.2730157375335693, "learning_rate": 1.6448711162691486e-05, "loss": 1.8555, "step": 11780 }, { "epoch": 1.6747090547828556, "grad_norm": 2.3802435398101807, "learning_rate": 1.6437317952265094e-05, "loss": 1.8813, "step": 11800 }, { "epoch": 1.6775475447062163, "grad_norm": 2.356393814086914, "learning_rate": 1.6425910455075734e-05, "loss": 1.8994, "step": 11820 }, { "epoch": 1.680386034629577, "grad_norm": 2.360194683074951, "learning_rate": 1.6414488696440803e-05, "loss": 1.8672, "step": 11840 }, { "epoch": 1.6832245245529378, "grad_norm": 2.483050584793091, "learning_rate": 1.640305270170937e-05, "loss": 1.9023, "step": 11860 }, { "epoch": 1.6860630144762987, "grad_norm": 2.2262065410614014, "learning_rate": 1.6391602496262082e-05, "loss": 1.8562, "step": 11880 }, { "epoch": 1.6889015043996594, "grad_norm": 2.3203928470611572, "learning_rate": 1.638013810551114e-05, "loss": 1.8425, "step": 11900 }, { "epoch": 1.69173999432302, "grad_norm": 2.280611991882324, "learning_rate": 1.6368659554900217e-05, "loss": 1.9063, "step": 11920 }, { "epoch": 1.694578484246381, "grad_norm": 2.471146821975708, "learning_rate": 1.6357166869904416e-05, "loss": 1.8786, "step": 11940 }, { "epoch": 1.6974169741697418, "grad_norm": 2.3377063274383545, "learning_rate": 1.6345660076030208e-05, "loss": 1.86, "step": 11960 }, { "epoch": 1.7002554640931025, "grad_norm": 2.4441287517547607, "learning_rate": 1.6334139198815374e-05, "loss": 1.8538, "step": 11980 }, { "epoch": 1.7030939540164631, "grad_norm": 2.321810722351074, "learning_rate": 1.632260426382896e-05, "loss": 1.8992, "step": 12000 }, { "epoch": 1.705932443939824, "grad_norm": 2.3958258628845215, "learning_rate": 1.6311055296671203e-05, "loss": 1.8523, "step": 12020 }, { "epoch": 1.708770933863185, "grad_norm": 2.31341814994812, "learning_rate": 1.6299492322973483e-05, "loss": 1.9228, "step": 12040 }, { "epoch": 1.7116094237865456, "grad_norm": 2.275038003921509, "learning_rate": 1.628791536839827e-05, "loss": 1.891, "step": 12060 }, { "epoch": 1.7144479137099062, "grad_norm": 2.3721940517425537, "learning_rate": 1.6276324458639062e-05, "loss": 1.9238, "step": 12080 }, { "epoch": 1.7172864036332671, "grad_norm": 2.253066301345825, "learning_rate": 1.6264719619420323e-05, "loss": 1.8799, "step": 12100 }, { "epoch": 1.720124893556628, "grad_norm": 2.3514933586120605, "learning_rate": 1.6253100876497438e-05, "loss": 1.8629, "step": 12120 }, { "epoch": 1.7229633834799887, "grad_norm": 2.346400499343872, "learning_rate": 1.624146825565665e-05, "loss": 1.8882, "step": 12140 }, { "epoch": 1.7258018734033493, "grad_norm": 2.2781994342803955, "learning_rate": 1.6229821782714994e-05, "loss": 1.8822, "step": 12160 }, { "epoch": 1.7286403633267102, "grad_norm": 2.35489559173584, "learning_rate": 1.6218161483520257e-05, "loss": 1.8894, "step": 12180 }, { "epoch": 1.7314788532500711, "grad_norm": 2.297468662261963, "learning_rate": 1.6206487383950903e-05, "loss": 1.8755, "step": 12200 }, { "epoch": 1.7343173431734318, "grad_norm": 2.4580492973327637, "learning_rate": 1.6194799509916033e-05, "loss": 1.8746, "step": 12220 }, { "epoch": 1.7371558330967924, "grad_norm": 2.3647725582122803, "learning_rate": 1.6183097887355313e-05, "loss": 1.9103, "step": 12240 }, { "epoch": 1.7399943230201533, "grad_norm": 2.2950212955474854, "learning_rate": 1.6171382542238925e-05, "loss": 1.9133, "step": 12260 }, { "epoch": 1.742832812943514, "grad_norm": 2.29827618598938, "learning_rate": 1.6159653500567505e-05, "loss": 1.8834, "step": 12280 }, { "epoch": 1.7456713028668749, "grad_norm": 2.469423770904541, "learning_rate": 1.614791078837208e-05, "loss": 1.8752, "step": 12300 }, { "epoch": 1.7485097927902356, "grad_norm": 2.2650680541992188, "learning_rate": 1.6136154431714032e-05, "loss": 1.9139, "step": 12320 }, { "epoch": 1.7513482827135962, "grad_norm": 2.4716033935546875, "learning_rate": 1.6124384456685007e-05, "loss": 1.8185, "step": 12340 }, { "epoch": 1.754186772636957, "grad_norm": 2.453181266784668, "learning_rate": 1.6112600889406894e-05, "loss": 1.8859, "step": 12360 }, { "epoch": 1.757025262560318, "grad_norm": 2.509669303894043, "learning_rate": 1.610080375603173e-05, "loss": 1.9151, "step": 12380 }, { "epoch": 1.7598637524836787, "grad_norm": 2.282261848449707, "learning_rate": 1.6088993082741674e-05, "loss": 1.9166, "step": 12400 }, { "epoch": 1.7627022424070393, "grad_norm": 2.3991317749023438, "learning_rate": 1.607716889574893e-05, "loss": 1.9037, "step": 12420 }, { "epoch": 1.7655407323304002, "grad_norm": 2.3945040702819824, "learning_rate": 1.606533122129569e-05, "loss": 1.8418, "step": 12440 }, { "epoch": 1.768379222253761, "grad_norm": 2.29438853263855, "learning_rate": 1.6053480085654086e-05, "loss": 1.8763, "step": 12460 }, { "epoch": 1.7712177121771218, "grad_norm": 2.33113956451416, "learning_rate": 1.6041615515126126e-05, "loss": 1.881, "step": 12480 }, { "epoch": 1.7740562021004824, "grad_norm": 2.4556398391723633, "learning_rate": 1.6029737536043623e-05, "loss": 1.8406, "step": 12500 }, { "epoch": 1.7768946920238433, "grad_norm": 2.2516586780548096, "learning_rate": 1.6017846174768163e-05, "loss": 1.8227, "step": 12520 }, { "epoch": 1.7797331819472042, "grad_norm": 2.3786494731903076, "learning_rate": 1.600594145769102e-05, "loss": 1.8563, "step": 12540 }, { "epoch": 1.7825716718705649, "grad_norm": 2.4075264930725098, "learning_rate": 1.5994023411233126e-05, "loss": 1.9556, "step": 12560 }, { "epoch": 1.7854101617939255, "grad_norm": 2.325441360473633, "learning_rate": 1.5982092061844978e-05, "loss": 1.8999, "step": 12580 }, { "epoch": 1.7882486517172864, "grad_norm": 2.397364616394043, "learning_rate": 1.59701474360066e-05, "loss": 1.9016, "step": 12600 }, { "epoch": 1.7910871416406473, "grad_norm": 2.2097063064575195, "learning_rate": 1.5958189560227492e-05, "loss": 1.8539, "step": 12620 }, { "epoch": 1.793925631564008, "grad_norm": 2.243323802947998, "learning_rate": 1.5946218461046554e-05, "loss": 1.8468, "step": 12640 }, { "epoch": 1.7967641214873686, "grad_norm": 2.193223714828491, "learning_rate": 1.593423416503203e-05, "loss": 1.8779, "step": 12660 }, { "epoch": 1.7996026114107295, "grad_norm": 2.241441011428833, "learning_rate": 1.5922236698781456e-05, "loss": 1.917, "step": 12680 }, { "epoch": 1.8024411013340904, "grad_norm": 2.3050918579101562, "learning_rate": 1.5910226088921607e-05, "loss": 1.9112, "step": 12700 }, { "epoch": 1.805279591257451, "grad_norm": 2.397857189178467, "learning_rate": 1.5898202362108403e-05, "loss": 1.8359, "step": 12720 }, { "epoch": 1.8081180811808117, "grad_norm": 2.413483142852783, "learning_rate": 1.5886165545026902e-05, "loss": 1.8904, "step": 12740 }, { "epoch": 1.8109565711041726, "grad_norm": 2.510800361633301, "learning_rate": 1.5874115664391206e-05, "loss": 1.902, "step": 12760 }, { "epoch": 1.8137950610275335, "grad_norm": 2.260368824005127, "learning_rate": 1.5862052746944397e-05, "loss": 1.9096, "step": 12780 }, { "epoch": 1.8166335509508942, "grad_norm": 2.2758848667144775, "learning_rate": 1.584997681945851e-05, "loss": 1.8752, "step": 12800 }, { "epoch": 1.8194720408742548, "grad_norm": 2.2785396575927734, "learning_rate": 1.5837887908734443e-05, "loss": 1.8959, "step": 12820 }, { "epoch": 1.8223105307976155, "grad_norm": 2.145648956298828, "learning_rate": 1.5825786041601907e-05, "loss": 1.8414, "step": 12840 }, { "epoch": 1.8251490207209764, "grad_norm": 2.248533248901367, "learning_rate": 1.581367124491937e-05, "loss": 1.8682, "step": 12860 }, { "epoch": 1.8279875106443373, "grad_norm": 2.370220899581909, "learning_rate": 1.5801543545574005e-05, "loss": 1.8618, "step": 12880 }, { "epoch": 1.830826000567698, "grad_norm": 2.3699800968170166, "learning_rate": 1.5789402970481606e-05, "loss": 1.8814, "step": 12900 }, { "epoch": 1.8336644904910586, "grad_norm": 2.3689448833465576, "learning_rate": 1.5777249546586552e-05, "loss": 1.8881, "step": 12920 }, { "epoch": 1.8365029804144195, "grad_norm": 2.4284286499023438, "learning_rate": 1.576508330086173e-05, "loss": 1.8364, "step": 12940 }, { "epoch": 1.8393414703377804, "grad_norm": 2.360368013381958, "learning_rate": 1.5752904260308502e-05, "loss": 1.8285, "step": 12960 }, { "epoch": 1.842179960261141, "grad_norm": 2.5541396141052246, "learning_rate": 1.57407124519566e-05, "loss": 1.9108, "step": 12980 }, { "epoch": 1.8450184501845017, "grad_norm": 2.281961441040039, "learning_rate": 1.5728507902864114e-05, "loss": 1.8824, "step": 13000 }, { "epoch": 1.8478569401078626, "grad_norm": 2.376593589782715, "learning_rate": 1.57162906401174e-05, "loss": 1.9152, "step": 13020 }, { "epoch": 1.8506954300312235, "grad_norm": 2.500288724899292, "learning_rate": 1.570406069083103e-05, "loss": 1.8922, "step": 13040 }, { "epoch": 1.8535339199545842, "grad_norm": 2.2535808086395264, "learning_rate": 1.5691818082147736e-05, "loss": 1.8704, "step": 13060 }, { "epoch": 1.8563724098779448, "grad_norm": 2.2436044216156006, "learning_rate": 1.5679562841238345e-05, "loss": 1.927, "step": 13080 }, { "epoch": 1.8592108998013057, "grad_norm": 2.3640642166137695, "learning_rate": 1.5667294995301723e-05, "loss": 1.9032, "step": 13100 }, { "epoch": 1.8620493897246666, "grad_norm": 2.2753047943115234, "learning_rate": 1.56550145715647e-05, "loss": 1.8526, "step": 13120 }, { "epoch": 1.8648878796480273, "grad_norm": 2.5170915126800537, "learning_rate": 1.564272159728203e-05, "loss": 1.8799, "step": 13140 }, { "epoch": 1.867726369571388, "grad_norm": 2.264301061630249, "learning_rate": 1.563041609973632e-05, "loss": 1.8832, "step": 13160 }, { "epoch": 1.8705648594947488, "grad_norm": 2.295016050338745, "learning_rate": 1.561809810623797e-05, "loss": 1.8998, "step": 13180 }, { "epoch": 1.8734033494181097, "grad_norm": 2.4556353092193604, "learning_rate": 1.5605767644125112e-05, "loss": 1.8918, "step": 13200 }, { "epoch": 1.8762418393414704, "grad_norm": 2.345303535461426, "learning_rate": 1.559342474076355e-05, "loss": 1.9161, "step": 13220 }, { "epoch": 1.879080329264831, "grad_norm": 2.446317195892334, "learning_rate": 1.5581069423546707e-05, "loss": 1.8727, "step": 13240 }, { "epoch": 1.881918819188192, "grad_norm": 2.3398241996765137, "learning_rate": 1.5568701719895546e-05, "loss": 1.9343, "step": 13260 }, { "epoch": 1.8847573091115528, "grad_norm": 2.2129993438720703, "learning_rate": 1.5556321657258522e-05, "loss": 1.874, "step": 13280 }, { "epoch": 1.8875957990349135, "grad_norm": 2.3318018913269043, "learning_rate": 1.554392926311153e-05, "loss": 1.8819, "step": 13300 }, { "epoch": 1.8904342889582741, "grad_norm": 2.4746787548065186, "learning_rate": 1.5531524564957814e-05, "loss": 1.9103, "step": 13320 }, { "epoch": 1.8932727788816348, "grad_norm": 2.2729218006134033, "learning_rate": 1.5519107590327945e-05, "loss": 1.835, "step": 13340 }, { "epoch": 1.8961112688049957, "grad_norm": 2.252549648284912, "learning_rate": 1.5506678366779732e-05, "loss": 1.8504, "step": 13360 }, { "epoch": 1.8989497587283566, "grad_norm": 2.310727834701538, "learning_rate": 1.549423692189816e-05, "loss": 1.9156, "step": 13380 }, { "epoch": 1.9017882486517173, "grad_norm": 2.3093137741088867, "learning_rate": 1.5481783283295355e-05, "loss": 1.8351, "step": 13400 }, { "epoch": 1.904626738575078, "grad_norm": 2.3642239570617676, "learning_rate": 1.546931747861049e-05, "loss": 1.9063, "step": 13420 }, { "epoch": 1.9074652284984388, "grad_norm": 2.112760305404663, "learning_rate": 1.5456839535509748e-05, "loss": 1.8657, "step": 13440 }, { "epoch": 1.9103037184217997, "grad_norm": 2.2627687454223633, "learning_rate": 1.5444349481686246e-05, "loss": 1.8897, "step": 13460 }, { "epoch": 1.9131422083451604, "grad_norm": 2.297877550125122, "learning_rate": 1.543184734485998e-05, "loss": 1.8309, "step": 13480 }, { "epoch": 1.915980698268521, "grad_norm": 2.3885910511016846, "learning_rate": 1.5419333152777768e-05, "loss": 1.9025, "step": 13500 }, { "epoch": 1.918819188191882, "grad_norm": 2.304903030395508, "learning_rate": 1.540680693321318e-05, "loss": 1.8846, "step": 13520 }, { "epoch": 1.9216576781152428, "grad_norm": 2.3368923664093018, "learning_rate": 1.5394268713966475e-05, "loss": 1.8361, "step": 13540 }, { "epoch": 1.9244961680386035, "grad_norm": 2.295138120651245, "learning_rate": 1.538171852286455e-05, "loss": 1.8568, "step": 13560 }, { "epoch": 1.9273346579619641, "grad_norm": 2.455098867416382, "learning_rate": 1.5369156387760866e-05, "loss": 1.8493, "step": 13580 }, { "epoch": 1.930173147885325, "grad_norm": 2.165491819381714, "learning_rate": 1.53565823365354e-05, "loss": 1.8978, "step": 13600 }, { "epoch": 1.933011637808686, "grad_norm": 2.3962185382843018, "learning_rate": 1.5343996397094568e-05, "loss": 1.8799, "step": 13620 }, { "epoch": 1.9358501277320466, "grad_norm": 2.42972993850708, "learning_rate": 1.5331398597371173e-05, "loss": 1.8648, "step": 13640 }, { "epoch": 1.9386886176554072, "grad_norm": 2.2417454719543457, "learning_rate": 1.531878896532434e-05, "loss": 1.8706, "step": 13660 }, { "epoch": 1.9415271075787681, "grad_norm": 2.3768672943115234, "learning_rate": 1.5306167528939457e-05, "loss": 1.8876, "step": 13680 }, { "epoch": 1.944365597502129, "grad_norm": 2.4054510593414307, "learning_rate": 1.5293534316228104e-05, "loss": 1.8923, "step": 13700 }, { "epoch": 1.9472040874254897, "grad_norm": 2.328606605529785, "learning_rate": 1.5280889355228004e-05, "loss": 1.8276, "step": 13720 }, { "epoch": 1.9500425773488503, "grad_norm": 2.424445629119873, "learning_rate": 1.526823267400295e-05, "loss": 1.8756, "step": 13740 }, { "epoch": 1.9528810672722112, "grad_norm": 2.323063611984253, "learning_rate": 1.5255564300642735e-05, "loss": 1.9151, "step": 13760 }, { "epoch": 1.9557195571955721, "grad_norm": 2.2834620475769043, "learning_rate": 1.5242884263263133e-05, "loss": 1.8855, "step": 13780 }, { "epoch": 1.9585580471189328, "grad_norm": 2.2502670288085938, "learning_rate": 1.5230192590005772e-05, "loss": 1.8809, "step": 13800 }, { "epoch": 1.9613965370422934, "grad_norm": 2.4047374725341797, "learning_rate": 1.521748930903812e-05, "loss": 1.8696, "step": 13820 }, { "epoch": 1.964235026965654, "grad_norm": 2.267759084701538, "learning_rate": 1.5204774448553403e-05, "loss": 1.8677, "step": 13840 }, { "epoch": 1.967073516889015, "grad_norm": 2.33536434173584, "learning_rate": 1.5192048036770548e-05, "loss": 1.8507, "step": 13860 }, { "epoch": 1.9699120068123759, "grad_norm": 2.4818344116210938, "learning_rate": 1.5179310101934118e-05, "loss": 1.9419, "step": 13880 }, { "epoch": 1.9727504967357365, "grad_norm": 2.2149715423583984, "learning_rate": 1.5166560672314245e-05, "loss": 1.8694, "step": 13900 }, { "epoch": 1.9755889866590972, "grad_norm": 2.325910806655884, "learning_rate": 1.515379977620658e-05, "loss": 1.9118, "step": 13920 }, { "epoch": 1.978427476582458, "grad_norm": 2.5102972984313965, "learning_rate": 1.5141027441932217e-05, "loss": 1.8627, "step": 13940 }, { "epoch": 1.981265966505819, "grad_norm": 2.2898850440979004, "learning_rate": 1.5128243697837641e-05, "loss": 1.8453, "step": 13960 }, { "epoch": 1.9841044564291797, "grad_norm": 2.428840160369873, "learning_rate": 1.5115448572294654e-05, "loss": 1.8726, "step": 13980 }, { "epoch": 1.9869429463525403, "grad_norm": 2.2928237915039062, "learning_rate": 1.5102642093700314e-05, "loss": 1.8524, "step": 14000 }, { "epoch": 1.9897814362759012, "grad_norm": 2.375397205352783, "learning_rate": 1.5089824290476891e-05, "loss": 1.8663, "step": 14020 }, { "epoch": 1.992619926199262, "grad_norm": 2.3106205463409424, "learning_rate": 1.507699519107177e-05, "loss": 1.8635, "step": 14040 }, { "epoch": 1.9954584161226228, "grad_norm": 2.2063169479370117, "learning_rate": 1.5064154823957415e-05, "loss": 1.8548, "step": 14060 }, { "epoch": 1.9982969060459834, "grad_norm": 2.4230427742004395, "learning_rate": 1.5051303217631303e-05, "loss": 1.886, "step": 14080 }, { "epoch": 2.001135395969344, "grad_norm": 2.3671674728393555, "learning_rate": 1.5038440400615843e-05, "loss": 1.7966, "step": 14100 }, { "epoch": 2.003973885892705, "grad_norm": 2.6389968395233154, "learning_rate": 1.5025566401458336e-05, "loss": 1.6388, "step": 14120 }, { "epoch": 2.006812375816066, "grad_norm": 2.4903147220611572, "learning_rate": 1.5012681248730889e-05, "loss": 1.6246, "step": 14140 }, { "epoch": 2.0096508657394265, "grad_norm": 2.571131706237793, "learning_rate": 1.4999784971030373e-05, "loss": 1.6644, "step": 14160 }, { "epoch": 2.012489355662787, "grad_norm": 2.3591833114624023, "learning_rate": 1.4986877596978344e-05, "loss": 1.6337, "step": 14180 }, { "epoch": 2.0153278455861483, "grad_norm": 2.6393609046936035, "learning_rate": 1.4973959155220985e-05, "loss": 1.6326, "step": 14200 }, { "epoch": 2.018166335509509, "grad_norm": 2.635631799697876, "learning_rate": 1.4961029674429044e-05, "loss": 1.6383, "step": 14220 }, { "epoch": 2.0210048254328696, "grad_norm": 2.62206768989563, "learning_rate": 1.4948089183297767e-05, "loss": 1.6843, "step": 14240 }, { "epoch": 2.0238433153562303, "grad_norm": 2.5939929485321045, "learning_rate": 1.4935137710546837e-05, "loss": 1.6428, "step": 14260 }, { "epoch": 2.0266818052795914, "grad_norm": 2.583174467086792, "learning_rate": 1.4922175284920309e-05, "loss": 1.6058, "step": 14280 }, { "epoch": 2.029520295202952, "grad_norm": 2.9003219604492188, "learning_rate": 1.4909201935186543e-05, "loss": 1.6646, "step": 14300 }, { "epoch": 2.0323587851263127, "grad_norm": 2.623399496078491, "learning_rate": 1.4896217690138156e-05, "loss": 1.6559, "step": 14320 }, { "epoch": 2.0351972750496734, "grad_norm": 2.6853528022766113, "learning_rate": 1.4883222578591927e-05, "loss": 1.629, "step": 14340 }, { "epoch": 2.0380357649730345, "grad_norm": 2.6103992462158203, "learning_rate": 1.4870216629388767e-05, "loss": 1.7055, "step": 14360 }, { "epoch": 2.040874254896395, "grad_norm": 2.570093870162964, "learning_rate": 1.4857199871393633e-05, "loss": 1.6514, "step": 14380 }, { "epoch": 2.043712744819756, "grad_norm": 2.7448740005493164, "learning_rate": 1.4844172333495473e-05, "loss": 1.6428, "step": 14400 }, { "epoch": 2.0465512347431165, "grad_norm": 2.767152786254883, "learning_rate": 1.4831134044607156e-05, "loss": 1.6814, "step": 14420 }, { "epoch": 2.0493897246664776, "grad_norm": 2.803035259246826, "learning_rate": 1.4818085033665416e-05, "loss": 1.6352, "step": 14440 }, { "epoch": 2.0522282145898383, "grad_norm": 2.6689233779907227, "learning_rate": 1.480502532963078e-05, "loss": 1.6178, "step": 14460 }, { "epoch": 2.055066704513199, "grad_norm": 2.7869627475738525, "learning_rate": 1.479195496148751e-05, "loss": 1.6448, "step": 14480 }, { "epoch": 2.0579051944365596, "grad_norm": 2.533984661102295, "learning_rate": 1.4778873958243528e-05, "loss": 1.6705, "step": 14500 }, { "epoch": 2.0607436843599207, "grad_norm": 2.614567756652832, "learning_rate": 1.4765782348930365e-05, "loss": 1.6496, "step": 14520 }, { "epoch": 2.0635821742832814, "grad_norm": 2.770102024078369, "learning_rate": 1.4752680162603098e-05, "loss": 1.6556, "step": 14540 }, { "epoch": 2.066420664206642, "grad_norm": 2.777745008468628, "learning_rate": 1.4739567428340262e-05, "loss": 1.6563, "step": 14560 }, { "epoch": 2.0692591541300027, "grad_norm": 2.700680732727051, "learning_rate": 1.4726444175243814e-05, "loss": 1.6449, "step": 14580 }, { "epoch": 2.072097644053364, "grad_norm": 2.6471896171569824, "learning_rate": 1.4713310432439052e-05, "loss": 1.6193, "step": 14600 }, { "epoch": 2.0749361339767245, "grad_norm": 2.780179738998413, "learning_rate": 1.4700166229074557e-05, "loss": 1.6208, "step": 14620 }, { "epoch": 2.077774623900085, "grad_norm": 2.7406816482543945, "learning_rate": 1.4687011594322125e-05, "loss": 1.6116, "step": 14640 }, { "epoch": 2.080613113823446, "grad_norm": 2.7625088691711426, "learning_rate": 1.4673846557376694e-05, "loss": 1.582, "step": 14660 }, { "epoch": 2.0834516037468065, "grad_norm": 2.448232650756836, "learning_rate": 1.4660671147456302e-05, "loss": 1.6664, "step": 14680 }, { "epoch": 2.0862900936701676, "grad_norm": 2.7308437824249268, "learning_rate": 1.4647485393802001e-05, "loss": 1.6525, "step": 14700 }, { "epoch": 2.0891285835935283, "grad_norm": 2.724123477935791, "learning_rate": 1.4634289325677802e-05, "loss": 1.6552, "step": 14720 }, { "epoch": 2.091967073516889, "grad_norm": 2.670830249786377, "learning_rate": 1.4621082972370607e-05, "loss": 1.6255, "step": 14740 }, { "epoch": 2.0948055634402496, "grad_norm": 2.5322370529174805, "learning_rate": 1.4607866363190138e-05, "loss": 1.6605, "step": 14760 }, { "epoch": 2.0976440533636107, "grad_norm": 2.7693140506744385, "learning_rate": 1.4594639527468893e-05, "loss": 1.659, "step": 14780 }, { "epoch": 2.1004825432869714, "grad_norm": 2.5515897274017334, "learning_rate": 1.4581402494562056e-05, "loss": 1.6442, "step": 14800 }, { "epoch": 2.103321033210332, "grad_norm": 2.6275620460510254, "learning_rate": 1.4568155293847443e-05, "loss": 1.6182, "step": 14820 }, { "epoch": 2.1061595231336927, "grad_norm": 2.7141737937927246, "learning_rate": 1.4554897954725438e-05, "loss": 1.6054, "step": 14840 }, { "epoch": 2.108998013057054, "grad_norm": 2.591804265975952, "learning_rate": 1.4541630506618924e-05, "loss": 1.671, "step": 14860 }, { "epoch": 2.1118365029804145, "grad_norm": 2.657256841659546, "learning_rate": 1.4528352978973223e-05, "loss": 1.633, "step": 14880 }, { "epoch": 2.114674992903775, "grad_norm": 2.7163732051849365, "learning_rate": 1.4515065401256026e-05, "loss": 1.6485, "step": 14900 }, { "epoch": 2.117513482827136, "grad_norm": 2.806790590286255, "learning_rate": 1.450176780295732e-05, "loss": 1.6546, "step": 14920 }, { "epoch": 2.120351972750497, "grad_norm": 2.5761098861694336, "learning_rate": 1.4488460213589352e-05, "loss": 1.582, "step": 14940 }, { "epoch": 2.1231904626738576, "grad_norm": 2.9504895210266113, "learning_rate": 1.447514266268652e-05, "loss": 1.6547, "step": 14960 }, { "epoch": 2.1260289525972182, "grad_norm": 2.762228012084961, "learning_rate": 1.4461815179805347e-05, "loss": 1.644, "step": 14980 }, { "epoch": 2.128867442520579, "grad_norm": 2.949052333831787, "learning_rate": 1.4448477794524385e-05, "loss": 1.6522, "step": 15000 }, { "epoch": 2.13170593244394, "grad_norm": 2.6244938373565674, "learning_rate": 1.4435130536444175e-05, "loss": 1.6575, "step": 15020 }, { "epoch": 2.1345444223673007, "grad_norm": 2.6246156692504883, "learning_rate": 1.4421773435187164e-05, "loss": 1.6648, "step": 15040 }, { "epoch": 2.1373829122906614, "grad_norm": 2.6142659187316895, "learning_rate": 1.4408406520397641e-05, "loss": 1.6372, "step": 15060 }, { "epoch": 2.140221402214022, "grad_norm": 2.6811931133270264, "learning_rate": 1.4395029821741687e-05, "loss": 1.6749, "step": 15080 }, { "epoch": 2.1430598921373827, "grad_norm": 2.7237887382507324, "learning_rate": 1.4381643368907081e-05, "loss": 1.6, "step": 15100 }, { "epoch": 2.145898382060744, "grad_norm": 2.821321487426758, "learning_rate": 1.4368247191603262e-05, "loss": 1.6523, "step": 15120 }, { "epoch": 2.1487368719841045, "grad_norm": 2.685486078262329, "learning_rate": 1.4354841319561244e-05, "loss": 1.6454, "step": 15140 }, { "epoch": 2.151575361907465, "grad_norm": 2.9063682556152344, "learning_rate": 1.4341425782533565e-05, "loss": 1.6397, "step": 15160 }, { "epoch": 2.1544138518308262, "grad_norm": 2.7164576053619385, "learning_rate": 1.4328000610294202e-05, "loss": 1.6664, "step": 15180 }, { "epoch": 2.157252341754187, "grad_norm": 2.6676976680755615, "learning_rate": 1.4314565832638528e-05, "loss": 1.6284, "step": 15200 }, { "epoch": 2.1600908316775476, "grad_norm": 2.733680486679077, "learning_rate": 1.4301121479383225e-05, "loss": 1.6609, "step": 15220 }, { "epoch": 2.1629293216009082, "grad_norm": 2.7849810123443604, "learning_rate": 1.4287667580366233e-05, "loss": 1.6871, "step": 15240 }, { "epoch": 2.165767811524269, "grad_norm": 2.724754571914673, "learning_rate": 1.4274204165446672e-05, "loss": 1.6673, "step": 15260 }, { "epoch": 2.16860630144763, "grad_norm": 2.828763008117676, "learning_rate": 1.4260731264504786e-05, "loss": 1.698, "step": 15280 }, { "epoch": 2.1714447913709907, "grad_norm": 2.7576427459716797, "learning_rate": 1.424724890744187e-05, "loss": 1.6524, "step": 15300 }, { "epoch": 2.1742832812943513, "grad_norm": 2.7064642906188965, "learning_rate": 1.42337571241802e-05, "loss": 1.6467, "step": 15320 }, { "epoch": 2.177121771217712, "grad_norm": 2.618053913116455, "learning_rate": 1.4220255944662987e-05, "loss": 1.637, "step": 15340 }, { "epoch": 2.179960261141073, "grad_norm": 2.7968339920043945, "learning_rate": 1.4206745398854278e-05, "loss": 1.6706, "step": 15360 }, { "epoch": 2.1827987510644338, "grad_norm": 2.7265772819519043, "learning_rate": 1.4193225516738917e-05, "loss": 1.6671, "step": 15380 }, { "epoch": 2.1856372409877944, "grad_norm": 2.770986557006836, "learning_rate": 1.4179696328322474e-05, "loss": 1.6491, "step": 15400 }, { "epoch": 2.188475730911155, "grad_norm": 2.7630534172058105, "learning_rate": 1.4166157863631158e-05, "loss": 1.6897, "step": 15420 }, { "epoch": 2.191314220834516, "grad_norm": 2.710226058959961, "learning_rate": 1.4152610152711781e-05, "loss": 1.705, "step": 15440 }, { "epoch": 2.194152710757877, "grad_norm": 2.6159048080444336, "learning_rate": 1.4139053225631662e-05, "loss": 1.6274, "step": 15460 }, { "epoch": 2.1969912006812375, "grad_norm": 2.711472988128662, "learning_rate": 1.4125487112478584e-05, "loss": 1.6773, "step": 15480 }, { "epoch": 2.199829690604598, "grad_norm": 2.6465487480163574, "learning_rate": 1.4111911843360717e-05, "loss": 1.6025, "step": 15500 }, { "epoch": 2.2026681805279593, "grad_norm": 2.7612884044647217, "learning_rate": 1.4098327448406544e-05, "loss": 1.6785, "step": 15520 }, { "epoch": 2.20550667045132, "grad_norm": 2.931784152984619, "learning_rate": 1.4084733957764802e-05, "loss": 1.6562, "step": 15540 }, { "epoch": 2.2083451603746806, "grad_norm": 2.6920652389526367, "learning_rate": 1.4071131401604427e-05, "loss": 1.6974, "step": 15560 }, { "epoch": 2.2111836502980413, "grad_norm": 2.770467519760132, "learning_rate": 1.4057519810114461e-05, "loss": 1.6561, "step": 15580 }, { "epoch": 2.2140221402214024, "grad_norm": 2.8092517852783203, "learning_rate": 1.4043899213504004e-05, "loss": 1.638, "step": 15600 }, { "epoch": 2.216860630144763, "grad_norm": 2.4220733642578125, "learning_rate": 1.4030269642002146e-05, "loss": 1.5947, "step": 15620 }, { "epoch": 2.2196991200681238, "grad_norm": 2.768281936645508, "learning_rate": 1.4016631125857887e-05, "loss": 1.6904, "step": 15640 }, { "epoch": 2.2225376099914844, "grad_norm": 2.6609528064727783, "learning_rate": 1.4002983695340085e-05, "loss": 1.6867, "step": 15660 }, { "epoch": 2.225376099914845, "grad_norm": 2.676619052886963, "learning_rate": 1.3989327380737377e-05, "loss": 1.6718, "step": 15680 }, { "epoch": 2.228214589838206, "grad_norm": 2.612619400024414, "learning_rate": 1.3975662212358123e-05, "loss": 1.6203, "step": 15700 }, { "epoch": 2.231053079761567, "grad_norm": 2.616924285888672, "learning_rate": 1.3961988220530327e-05, "loss": 1.6733, "step": 15720 }, { "epoch": 2.2338915696849275, "grad_norm": 2.693221092224121, "learning_rate": 1.394830543560158e-05, "loss": 1.6704, "step": 15740 }, { "epoch": 2.236730059608288, "grad_norm": 2.8380417823791504, "learning_rate": 1.3934613887938981e-05, "loss": 1.6112, "step": 15760 }, { "epoch": 2.2395685495316493, "grad_norm": 2.8150603771209717, "learning_rate": 1.3920913607929092e-05, "loss": 1.6373, "step": 15780 }, { "epoch": 2.24240703945501, "grad_norm": 2.806086301803589, "learning_rate": 1.3907204625977837e-05, "loss": 1.6553, "step": 15800 }, { "epoch": 2.2452455293783706, "grad_norm": 2.7372372150421143, "learning_rate": 1.3893486972510463e-05, "loss": 1.6344, "step": 15820 }, { "epoch": 2.2480840193017313, "grad_norm": 2.7969744205474854, "learning_rate": 1.387976067797146e-05, "loss": 1.6715, "step": 15840 }, { "epoch": 2.2509225092250924, "grad_norm": 2.767852306365967, "learning_rate": 1.3866025772824498e-05, "loss": 1.6482, "step": 15860 }, { "epoch": 2.253760999148453, "grad_norm": 2.8310275077819824, "learning_rate": 1.3852282287552356e-05, "loss": 1.663, "step": 15880 }, { "epoch": 2.2565994890718137, "grad_norm": 2.5795023441314697, "learning_rate": 1.383853025265685e-05, "loss": 1.6334, "step": 15900 }, { "epoch": 2.2594379789951744, "grad_norm": 2.841590642929077, "learning_rate": 1.382476969865878e-05, "loss": 1.6841, "step": 15920 }, { "epoch": 2.2622764689185355, "grad_norm": 2.7017767429351807, "learning_rate": 1.3811000656097844e-05, "loss": 1.6667, "step": 15940 }, { "epoch": 2.265114958841896, "grad_norm": 2.8952524662017822, "learning_rate": 1.3797223155532585e-05, "loss": 1.6533, "step": 15960 }, { "epoch": 2.267953448765257, "grad_norm": 2.591139078140259, "learning_rate": 1.3783437227540314e-05, "loss": 1.6176, "step": 15980 }, { "epoch": 2.2707919386886175, "grad_norm": 2.6861958503723145, "learning_rate": 1.376964290271705e-05, "loss": 1.6615, "step": 16000 }, { "epoch": 2.2736304286119786, "grad_norm": 2.747091293334961, "learning_rate": 1.3755840211677444e-05, "loss": 1.6595, "step": 16020 }, { "epoch": 2.2764689185353393, "grad_norm": 2.6577868461608887, "learning_rate": 1.374202918505472e-05, "loss": 1.6756, "step": 16040 }, { "epoch": 2.2793074084587, "grad_norm": 2.60575008392334, "learning_rate": 1.3728209853500589e-05, "loss": 1.679, "step": 16060 }, { "epoch": 2.2821458983820606, "grad_norm": 2.7935256958007812, "learning_rate": 1.3714382247685208e-05, "loss": 1.6609, "step": 16080 }, { "epoch": 2.2849843883054213, "grad_norm": 2.6807548999786377, "learning_rate": 1.3700546398297092e-05, "loss": 1.6261, "step": 16100 }, { "epoch": 2.2878228782287824, "grad_norm": 2.8889832496643066, "learning_rate": 1.368670233604305e-05, "loss": 1.6669, "step": 16120 }, { "epoch": 2.290661368152143, "grad_norm": 2.7474524974823, "learning_rate": 1.3672850091648117e-05, "loss": 1.6097, "step": 16140 }, { "epoch": 2.2934998580755037, "grad_norm": 2.6257283687591553, "learning_rate": 1.3658989695855494e-05, "loss": 1.6281, "step": 16160 }, { "epoch": 2.296338347998865, "grad_norm": 2.704493761062622, "learning_rate": 1.3645121179426466e-05, "loss": 1.6157, "step": 16180 }, { "epoch": 2.2991768379222255, "grad_norm": 2.593991756439209, "learning_rate": 1.3631244573140344e-05, "loss": 1.6249, "step": 16200 }, { "epoch": 2.302015327845586, "grad_norm": 2.7354273796081543, "learning_rate": 1.361735990779439e-05, "loss": 1.6884, "step": 16220 }, { "epoch": 2.304853817768947, "grad_norm": 2.779437780380249, "learning_rate": 1.3603467214203759e-05, "loss": 1.6918, "step": 16240 }, { "epoch": 2.3076923076923075, "grad_norm": 2.768787384033203, "learning_rate": 1.3589566523201418e-05, "loss": 1.6733, "step": 16260 }, { "epoch": 2.3105307976156686, "grad_norm": 2.833940029144287, "learning_rate": 1.3575657865638085e-05, "loss": 1.6962, "step": 16280 }, { "epoch": 2.3133692875390293, "grad_norm": 2.808438777923584, "learning_rate": 1.3561741272382157e-05, "loss": 1.7031, "step": 16300 }, { "epoch": 2.31620777746239, "grad_norm": 2.6637308597564697, "learning_rate": 1.3547816774319644e-05, "loss": 1.7034, "step": 16320 }, { "epoch": 2.3190462673857506, "grad_norm": 2.629077434539795, "learning_rate": 1.3533884402354103e-05, "loss": 1.6133, "step": 16340 }, { "epoch": 2.3218847573091117, "grad_norm": 2.665250062942505, "learning_rate": 1.3519944187406559e-05, "loss": 1.6406, "step": 16360 }, { "epoch": 2.3247232472324724, "grad_norm": 2.6717216968536377, "learning_rate": 1.3505996160415448e-05, "loss": 1.6391, "step": 16380 }, { "epoch": 2.327561737155833, "grad_norm": 2.765874147415161, "learning_rate": 1.3492040352336548e-05, "loss": 1.6139, "step": 16400 }, { "epoch": 2.3304002270791937, "grad_norm": 2.879499912261963, "learning_rate": 1.3478076794142899e-05, "loss": 1.6454, "step": 16420 }, { "epoch": 2.333238717002555, "grad_norm": 2.7924551963806152, "learning_rate": 1.3464105516824741e-05, "loss": 1.7004, "step": 16440 }, { "epoch": 2.3360772069259155, "grad_norm": 2.7460954189300537, "learning_rate": 1.345012655138945e-05, "loss": 1.6843, "step": 16460 }, { "epoch": 2.338915696849276, "grad_norm": 2.7618355751037598, "learning_rate": 1.3436139928861463e-05, "loss": 1.6952, "step": 16480 }, { "epoch": 2.341754186772637, "grad_norm": 2.792635440826416, "learning_rate": 1.3422145680282216e-05, "loss": 1.6227, "step": 16500 }, { "epoch": 2.344592676695998, "grad_norm": 2.626446008682251, "learning_rate": 1.3408143836710057e-05, "loss": 1.6799, "step": 16520 }, { "epoch": 2.3474311666193586, "grad_norm": 2.679884433746338, "learning_rate": 1.33941344292202e-05, "loss": 1.6307, "step": 16540 }, { "epoch": 2.3502696565427192, "grad_norm": 2.6958634853363037, "learning_rate": 1.338011748890465e-05, "loss": 1.6736, "step": 16560 }, { "epoch": 2.35310814646608, "grad_norm": 2.7609002590179443, "learning_rate": 1.3366093046872118e-05, "loss": 1.6472, "step": 16580 }, { "epoch": 2.355946636389441, "grad_norm": 2.6660282611846924, "learning_rate": 1.3352061134247968e-05, "loss": 1.6447, "step": 16600 }, { "epoch": 2.3587851263128017, "grad_norm": 2.7295422554016113, "learning_rate": 1.3338021782174155e-05, "loss": 1.692, "step": 16620 }, { "epoch": 2.3616236162361623, "grad_norm": 2.816382646560669, "learning_rate": 1.3323975021809133e-05, "loss": 1.5963, "step": 16640 }, { "epoch": 2.364462106159523, "grad_norm": 2.7913992404937744, "learning_rate": 1.3309920884327797e-05, "loss": 1.6671, "step": 16660 }, { "epoch": 2.3673005960828837, "grad_norm": 2.6726088523864746, "learning_rate": 1.3295859400921418e-05, "loss": 1.6385, "step": 16680 }, { "epoch": 2.370139086006245, "grad_norm": 2.672245502471924, "learning_rate": 1.3281790602797578e-05, "loss": 1.6758, "step": 16700 }, { "epoch": 2.3729775759296055, "grad_norm": 2.7192418575286865, "learning_rate": 1.3267714521180077e-05, "loss": 1.6544, "step": 16720 }, { "epoch": 2.375816065852966, "grad_norm": 2.67012357711792, "learning_rate": 1.3253631187308894e-05, "loss": 1.711, "step": 16740 }, { "epoch": 2.3786545557763272, "grad_norm": 2.662221670150757, "learning_rate": 1.32395406324401e-05, "loss": 1.6152, "step": 16760 }, { "epoch": 2.381493045699688, "grad_norm": 2.7786319255828857, "learning_rate": 1.3225442887845783e-05, "loss": 1.6392, "step": 16780 }, { "epoch": 2.3843315356230486, "grad_norm": 2.6753108501434326, "learning_rate": 1.3211337984814e-05, "loss": 1.6605, "step": 16800 }, { "epoch": 2.3871700255464092, "grad_norm": 2.7844271659851074, "learning_rate": 1.3197225954648683e-05, "loss": 1.6637, "step": 16820 }, { "epoch": 2.39000851546977, "grad_norm": 2.764630079269409, "learning_rate": 1.3183106828669594e-05, "loss": 1.6753, "step": 16840 }, { "epoch": 2.392847005393131, "grad_norm": 2.703876495361328, "learning_rate": 1.3168980638212237e-05, "loss": 1.6709, "step": 16860 }, { "epoch": 2.3956854953164917, "grad_norm": 2.5503478050231934, "learning_rate": 1.3154847414627795e-05, "loss": 1.6988, "step": 16880 }, { "epoch": 2.3985239852398523, "grad_norm": 2.683335781097412, "learning_rate": 1.314070718928306e-05, "loss": 1.6743, "step": 16900 }, { "epoch": 2.401362475163213, "grad_norm": 2.698822259902954, "learning_rate": 1.3126559993560362e-05, "loss": 1.6754, "step": 16920 }, { "epoch": 2.404200965086574, "grad_norm": 2.6747167110443115, "learning_rate": 1.3112405858857504e-05, "loss": 1.692, "step": 16940 }, { "epoch": 2.4070394550099348, "grad_norm": 2.7661619186401367, "learning_rate": 1.3098244816587691e-05, "loss": 1.6409, "step": 16960 }, { "epoch": 2.4098779449332954, "grad_norm": 2.842583656311035, "learning_rate": 1.308407689817945e-05, "loss": 1.6494, "step": 16980 }, { "epoch": 2.412716434856656, "grad_norm": 2.7270514965057373, "learning_rate": 1.3069902135076578e-05, "loss": 1.6728, "step": 17000 }, { "epoch": 2.415554924780017, "grad_norm": 2.6723341941833496, "learning_rate": 1.3055720558738056e-05, "loss": 1.6733, "step": 17020 }, { "epoch": 2.418393414703378, "grad_norm": 2.9135372638702393, "learning_rate": 1.304153220063799e-05, "loss": 1.6274, "step": 17040 }, { "epoch": 2.4212319046267385, "grad_norm": 2.7837767601013184, "learning_rate": 1.3027337092265535e-05, "loss": 1.6426, "step": 17060 }, { "epoch": 2.424070394550099, "grad_norm": 2.7899529933929443, "learning_rate": 1.3013135265124826e-05, "loss": 1.6377, "step": 17080 }, { "epoch": 2.42690888447346, "grad_norm": 2.7650821208953857, "learning_rate": 1.2998926750734916e-05, "loss": 1.6672, "step": 17100 }, { "epoch": 2.429747374396821, "grad_norm": 2.6678178310394287, "learning_rate": 1.2984711580629696e-05, "loss": 1.6676, "step": 17120 }, { "epoch": 2.4325858643201816, "grad_norm": 2.9348485469818115, "learning_rate": 1.2970489786357825e-05, "loss": 1.6558, "step": 17140 }, { "epoch": 2.4354243542435423, "grad_norm": 2.7328803539276123, "learning_rate": 1.2956261399482664e-05, "loss": 1.6798, "step": 17160 }, { "epoch": 2.4382628441669034, "grad_norm": 2.7672085762023926, "learning_rate": 1.2942026451582213e-05, "loss": 1.6491, "step": 17180 }, { "epoch": 2.441101334090264, "grad_norm": 2.7562270164489746, "learning_rate": 1.2927784974249027e-05, "loss": 1.7024, "step": 17200 }, { "epoch": 2.4439398240136248, "grad_norm": 2.7690000534057617, "learning_rate": 1.2913536999090143e-05, "loss": 1.6725, "step": 17220 }, { "epoch": 2.4467783139369854, "grad_norm": 2.681001901626587, "learning_rate": 1.2899282557727043e-05, "loss": 1.6196, "step": 17240 }, { "epoch": 2.449616803860346, "grad_norm": 2.865332841873169, "learning_rate": 1.2885021681795537e-05, "loss": 1.6745, "step": 17260 }, { "epoch": 2.452455293783707, "grad_norm": 2.863779067993164, "learning_rate": 1.2870754402945731e-05, "loss": 1.6751, "step": 17280 }, { "epoch": 2.455293783707068, "grad_norm": 2.8962154388427734, "learning_rate": 1.2856480752841926e-05, "loss": 1.6499, "step": 17300 }, { "epoch": 2.4581322736304285, "grad_norm": 2.752647876739502, "learning_rate": 1.284220076316258e-05, "loss": 1.6868, "step": 17320 }, { "epoch": 2.4609707635537896, "grad_norm": 2.6083104610443115, "learning_rate": 1.282791446560021e-05, "loss": 1.6868, "step": 17340 }, { "epoch": 2.4638092534771503, "grad_norm": 2.6952102184295654, "learning_rate": 1.2813621891861339e-05, "loss": 1.6425, "step": 17360 }, { "epoch": 2.466647743400511, "grad_norm": 2.7221739292144775, "learning_rate": 1.279932307366641e-05, "loss": 1.6557, "step": 17380 }, { "epoch": 2.4694862333238716, "grad_norm": 2.544970989227295, "learning_rate": 1.2785018042749732e-05, "loss": 1.6677, "step": 17400 }, { "epoch": 2.4723247232472323, "grad_norm": 2.6459202766418457, "learning_rate": 1.2770706830859403e-05, "loss": 1.6544, "step": 17420 }, { "epoch": 2.4751632131705934, "grad_norm": 2.6648192405700684, "learning_rate": 1.2756389469757236e-05, "loss": 1.7131, "step": 17440 }, { "epoch": 2.478001703093954, "grad_norm": 2.7102556228637695, "learning_rate": 1.2742065991218686e-05, "loss": 1.6919, "step": 17460 }, { "epoch": 2.4808401930173147, "grad_norm": 2.649365186691284, "learning_rate": 1.2727736427032799e-05, "loss": 1.7005, "step": 17480 }, { "epoch": 2.4836786829406754, "grad_norm": 2.7724220752716064, "learning_rate": 1.2713400809002116e-05, "loss": 1.6491, "step": 17500 }, { "epoch": 2.4865171728640365, "grad_norm": 2.720832347869873, "learning_rate": 1.2699059168942615e-05, "loss": 1.6542, "step": 17520 }, { "epoch": 2.489355662787397, "grad_norm": 2.8054747581481934, "learning_rate": 1.2684711538683641e-05, "loss": 1.6893, "step": 17540 }, { "epoch": 2.492194152710758, "grad_norm": 2.6879336833953857, "learning_rate": 1.2670357950067835e-05, "loss": 1.656, "step": 17560 }, { "epoch": 2.4950326426341185, "grad_norm": 2.686126708984375, "learning_rate": 1.265599843495106e-05, "loss": 1.6788, "step": 17580 }, { "epoch": 2.4978711325574796, "grad_norm": 2.80765438079834, "learning_rate": 1.2641633025202328e-05, "loss": 1.6449, "step": 17600 }, { "epoch": 2.5007096224808403, "grad_norm": 2.720756769180298, "learning_rate": 1.2627261752703745e-05, "loss": 1.6932, "step": 17620 }, { "epoch": 2.503548112404201, "grad_norm": 2.7270922660827637, "learning_rate": 1.2612884649350415e-05, "loss": 1.6788, "step": 17640 }, { "epoch": 2.5063866023275616, "grad_norm": 2.74861216545105, "learning_rate": 1.2598501747050387e-05, "loss": 1.6696, "step": 17660 }, { "epoch": 2.5092250922509223, "grad_norm": 2.5770909786224365, "learning_rate": 1.2584113077724584e-05, "loss": 1.6869, "step": 17680 }, { "epoch": 2.5120635821742834, "grad_norm": 2.7025647163391113, "learning_rate": 1.256971867330673e-05, "loss": 1.6533, "step": 17700 }, { "epoch": 2.514902072097644, "grad_norm": 2.7311036586761475, "learning_rate": 1.2555318565743269e-05, "loss": 1.6816, "step": 17720 }, { "epoch": 2.5177405620210047, "grad_norm": 2.8846051692962646, "learning_rate": 1.2540912786993307e-05, "loss": 1.7042, "step": 17740 }, { "epoch": 2.520579051944366, "grad_norm": 2.7339835166931152, "learning_rate": 1.2526501369028534e-05, "loss": 1.6967, "step": 17760 }, { "epoch": 2.5234175418677265, "grad_norm": 2.7246944904327393, "learning_rate": 1.2512084343833162e-05, "loss": 1.7109, "step": 17780 }, { "epoch": 2.526256031791087, "grad_norm": 2.6940948963165283, "learning_rate": 1.2497661743403839e-05, "loss": 1.6797, "step": 17800 }, { "epoch": 2.529094521714448, "grad_norm": 2.7714884281158447, "learning_rate": 1.2483233599749594e-05, "loss": 1.6832, "step": 17820 }, { "epoch": 2.5319330116378085, "grad_norm": 2.71606707572937, "learning_rate": 1.2468799944891752e-05, "loss": 1.6519, "step": 17840 }, { "epoch": 2.5347715015611696, "grad_norm": 2.5598199367523193, "learning_rate": 1.2454360810863872e-05, "loss": 1.7208, "step": 17860 }, { "epoch": 2.5376099914845303, "grad_norm": 2.5094001293182373, "learning_rate": 1.2439916229711674e-05, "loss": 1.6633, "step": 17880 }, { "epoch": 2.540448481407891, "grad_norm": 2.7318613529205322, "learning_rate": 1.2425466233492969e-05, "loss": 1.7049, "step": 17900 }, { "epoch": 2.543286971331252, "grad_norm": 2.7886569499969482, "learning_rate": 1.2411010854277579e-05, "loss": 1.692, "step": 17920 }, { "epoch": 2.5461254612546127, "grad_norm": 2.7421681880950928, "learning_rate": 1.2396550124147282e-05, "loss": 1.6628, "step": 17940 }, { "epoch": 2.5489639511779734, "grad_norm": 2.7705025672912598, "learning_rate": 1.2382084075195727e-05, "loss": 1.6864, "step": 17960 }, { "epoch": 2.551802441101334, "grad_norm": 2.806274890899658, "learning_rate": 1.2367612739528364e-05, "loss": 1.6198, "step": 17980 }, { "epoch": 2.5546409310246947, "grad_norm": 2.497346878051758, "learning_rate": 1.235313614926238e-05, "loss": 1.6729, "step": 18000 }, { "epoch": 2.557479420948056, "grad_norm": 2.7188079357147217, "learning_rate": 1.2338654336526625e-05, "loss": 1.615, "step": 18020 }, { "epoch": 2.5603179108714165, "grad_norm": 2.7530691623687744, "learning_rate": 1.2324167333461541e-05, "loss": 1.6344, "step": 18040 }, { "epoch": 2.563156400794777, "grad_norm": 2.5454366207122803, "learning_rate": 1.2309675172219086e-05, "loss": 1.6511, "step": 18060 }, { "epoch": 2.565994890718138, "grad_norm": 2.694385290145874, "learning_rate": 1.2295177884962658e-05, "loss": 1.7381, "step": 18080 }, { "epoch": 2.5688333806414985, "grad_norm": 2.6957809925079346, "learning_rate": 1.2280675503867051e-05, "loss": 1.6676, "step": 18100 }, { "epoch": 2.5716718705648596, "grad_norm": 2.6730422973632812, "learning_rate": 1.2266168061118347e-05, "loss": 1.6959, "step": 18120 }, { "epoch": 2.5745103604882202, "grad_norm": 2.7614011764526367, "learning_rate": 1.2251655588913867e-05, "loss": 1.6507, "step": 18140 }, { "epoch": 2.577348850411581, "grad_norm": 2.8465051651000977, "learning_rate": 1.2237138119462095e-05, "loss": 1.7159, "step": 18160 }, { "epoch": 2.580187340334942, "grad_norm": 2.82724666595459, "learning_rate": 1.2222615684982607e-05, "loss": 1.6585, "step": 18180 }, { "epoch": 2.5830258302583027, "grad_norm": 2.6352789402008057, "learning_rate": 1.2208088317705997e-05, "loss": 1.6121, "step": 18200 }, { "epoch": 2.5858643201816633, "grad_norm": 2.651153087615967, "learning_rate": 1.2193556049873802e-05, "loss": 1.6955, "step": 18220 }, { "epoch": 2.588702810105024, "grad_norm": 2.6736204624176025, "learning_rate": 1.217901891373845e-05, "loss": 1.6723, "step": 18240 }, { "epoch": 2.5915413000283847, "grad_norm": 2.6160194873809814, "learning_rate": 1.2164476941563152e-05, "loss": 1.6876, "step": 18260 }, { "epoch": 2.594379789951746, "grad_norm": 2.6227149963378906, "learning_rate": 1.2149930165621867e-05, "loss": 1.6796, "step": 18280 }, { "epoch": 2.5972182798751065, "grad_norm": 2.688061475753784, "learning_rate": 1.213537861819921e-05, "loss": 1.6853, "step": 18300 }, { "epoch": 2.600056769798467, "grad_norm": 2.710500717163086, "learning_rate": 1.212082233159039e-05, "loss": 1.6534, "step": 18320 }, { "epoch": 2.6028952597218282, "grad_norm": 2.7508792877197266, "learning_rate": 1.2106261338101129e-05, "loss": 1.6734, "step": 18340 }, { "epoch": 2.605733749645189, "grad_norm": 2.598700523376465, "learning_rate": 1.2091695670047596e-05, "loss": 1.7021, "step": 18360 }, { "epoch": 2.6085722395685496, "grad_norm": 2.6375434398651123, "learning_rate": 1.2077125359756336e-05, "loss": 1.7254, "step": 18380 }, { "epoch": 2.61141072949191, "grad_norm": 2.779728412628174, "learning_rate": 1.20625504395642e-05, "loss": 1.7039, "step": 18400 }, { "epoch": 2.614249219415271, "grad_norm": 2.704413414001465, "learning_rate": 1.2047970941818264e-05, "loss": 1.662, "step": 18420 }, { "epoch": 2.617087709338632, "grad_norm": 2.6956117153167725, "learning_rate": 1.2033386898875766e-05, "loss": 1.6278, "step": 18440 }, { "epoch": 2.6199261992619927, "grad_norm": 2.8488147258758545, "learning_rate": 1.2018798343104033e-05, "loss": 1.686, "step": 18460 }, { "epoch": 2.6227646891853533, "grad_norm": 2.81943678855896, "learning_rate": 1.20042053068804e-05, "loss": 1.6467, "step": 18480 }, { "epoch": 2.6256031791087144, "grad_norm": 2.581073760986328, "learning_rate": 1.1989607822592163e-05, "loss": 1.6458, "step": 18500 }, { "epoch": 2.628441669032075, "grad_norm": 2.8560519218444824, "learning_rate": 1.1975005922636467e-05, "loss": 1.67, "step": 18520 }, { "epoch": 2.6312801589554358, "grad_norm": 2.6728341579437256, "learning_rate": 1.1960399639420273e-05, "loss": 1.6601, "step": 18540 }, { "epoch": 2.6341186488787964, "grad_norm": 2.7220418453216553, "learning_rate": 1.1945789005360268e-05, "loss": 1.6482, "step": 18560 }, { "epoch": 2.636957138802157, "grad_norm": 2.7361340522766113, "learning_rate": 1.1931174052882792e-05, "loss": 1.6676, "step": 18580 }, { "epoch": 2.639795628725518, "grad_norm": 2.8278684616088867, "learning_rate": 1.1916554814423769e-05, "loss": 1.6771, "step": 18600 }, { "epoch": 2.642634118648879, "grad_norm": 2.7515361309051514, "learning_rate": 1.1901931322428633e-05, "loss": 1.6565, "step": 18620 }, { "epoch": 2.6454726085722395, "grad_norm": 2.8489291667938232, "learning_rate": 1.1887303609352267e-05, "loss": 1.6639, "step": 18640 }, { "epoch": 2.6483110984956, "grad_norm": 2.8557803630828857, "learning_rate": 1.1872671707658914e-05, "loss": 1.674, "step": 18660 }, { "epoch": 2.651149588418961, "grad_norm": 2.670365571975708, "learning_rate": 1.1858035649822115e-05, "loss": 1.6615, "step": 18680 }, { "epoch": 2.653988078342322, "grad_norm": 2.711594343185425, "learning_rate": 1.1843395468324637e-05, "loss": 1.6684, "step": 18700 }, { "epoch": 2.6568265682656826, "grad_norm": 2.7269127368927, "learning_rate": 1.1828751195658394e-05, "loss": 1.6716, "step": 18720 }, { "epoch": 2.6596650581890433, "grad_norm": 2.744840383529663, "learning_rate": 1.1814102864324386e-05, "loss": 1.6962, "step": 18740 }, { "epoch": 2.6625035481124044, "grad_norm": 2.8173303604125977, "learning_rate": 1.1799450506832614e-05, "loss": 1.6419, "step": 18760 }, { "epoch": 2.665342038035765, "grad_norm": 2.682537317276001, "learning_rate": 1.1784794155702026e-05, "loss": 1.6658, "step": 18780 }, { "epoch": 2.6681805279591257, "grad_norm": 2.7286911010742188, "learning_rate": 1.1770133843460418e-05, "loss": 1.6514, "step": 18800 }, { "epoch": 2.6710190178824864, "grad_norm": 2.6757283210754395, "learning_rate": 1.1755469602644392e-05, "loss": 1.6395, "step": 18820 }, { "epoch": 2.673857507805847, "grad_norm": 2.8245978355407715, "learning_rate": 1.1740801465799253e-05, "loss": 1.6483, "step": 18840 }, { "epoch": 2.676695997729208, "grad_norm": 2.7346537113189697, "learning_rate": 1.1726129465478967e-05, "loss": 1.6973, "step": 18860 }, { "epoch": 2.679534487652569, "grad_norm": 2.5856940746307373, "learning_rate": 1.1711453634246071e-05, "loss": 1.6613, "step": 18880 }, { "epoch": 2.6823729775759295, "grad_norm": 2.716831922531128, "learning_rate": 1.1696774004671599e-05, "loss": 1.6802, "step": 18900 }, { "epoch": 2.6852114674992906, "grad_norm": 2.6702849864959717, "learning_rate": 1.1682090609335015e-05, "loss": 1.6751, "step": 18920 }, { "epoch": 2.6880499574226513, "grad_norm": 2.8055613040924072, "learning_rate": 1.1667403480824149e-05, "loss": 1.6648, "step": 18940 }, { "epoch": 2.690888447346012, "grad_norm": 2.694770097732544, "learning_rate": 1.1652712651735109e-05, "loss": 1.6355, "step": 18960 }, { "epoch": 2.6937269372693726, "grad_norm": 2.7334213256835938, "learning_rate": 1.163801815467222e-05, "loss": 1.6856, "step": 18980 }, { "epoch": 2.6965654271927333, "grad_norm": 2.7657744884490967, "learning_rate": 1.162332002224794e-05, "loss": 1.6821, "step": 19000 }, { "epoch": 2.6994039171160944, "grad_norm": 2.7348475456237793, "learning_rate": 1.1608618287082805e-05, "loss": 1.6591, "step": 19020 }, { "epoch": 2.702242407039455, "grad_norm": 2.6721115112304688, "learning_rate": 1.1593912981805344e-05, "loss": 1.6637, "step": 19040 }, { "epoch": 2.7050808969628157, "grad_norm": 2.645540714263916, "learning_rate": 1.1579204139052003e-05, "loss": 1.6956, "step": 19060 }, { "epoch": 2.7079193868861764, "grad_norm": 2.7195353507995605, "learning_rate": 1.1564491791467086e-05, "loss": 1.6555, "step": 19080 }, { "epoch": 2.710757876809537, "grad_norm": 2.6154191493988037, "learning_rate": 1.1549775971702677e-05, "loss": 1.6188, "step": 19100 }, { "epoch": 2.713596366732898, "grad_norm": 2.703265905380249, "learning_rate": 1.1535056712418555e-05, "loss": 1.6745, "step": 19120 }, { "epoch": 2.716434856656259, "grad_norm": 2.82840633392334, "learning_rate": 1.1520334046282146e-05, "loss": 1.6463, "step": 19140 }, { "epoch": 2.7192733465796195, "grad_norm": 2.8584494590759277, "learning_rate": 1.1505608005968427e-05, "loss": 1.6725, "step": 19160 }, { "epoch": 2.7221118365029806, "grad_norm": 2.651611089706421, "learning_rate": 1.149087862415987e-05, "loss": 1.6686, "step": 19180 }, { "epoch": 2.7249503264263413, "grad_norm": 2.712592601776123, "learning_rate": 1.1476145933546358e-05, "loss": 1.6934, "step": 19200 }, { "epoch": 2.727788816349702, "grad_norm": 2.6859164237976074, "learning_rate": 1.1461409966825124e-05, "loss": 1.7087, "step": 19220 }, { "epoch": 2.7306273062730626, "grad_norm": 2.708625078201294, "learning_rate": 1.1446670756700664e-05, "loss": 1.6651, "step": 19240 }, { "epoch": 2.7334657961964233, "grad_norm": 2.5579113960266113, "learning_rate": 1.1431928335884679e-05, "loss": 1.6921, "step": 19260 }, { "epoch": 2.7363042861197844, "grad_norm": 2.710376024246216, "learning_rate": 1.1417182737095991e-05, "loss": 1.6971, "step": 19280 }, { "epoch": 2.739142776043145, "grad_norm": 2.661865472793579, "learning_rate": 1.1402433993060475e-05, "loss": 1.6264, "step": 19300 }, { "epoch": 2.7419812659665057, "grad_norm": 2.677616596221924, "learning_rate": 1.1387682136510993e-05, "loss": 1.677, "step": 19320 }, { "epoch": 2.744819755889867, "grad_norm": 2.803898811340332, "learning_rate": 1.1372927200187307e-05, "loss": 1.6526, "step": 19340 }, { "epoch": 2.7476582458132275, "grad_norm": 2.648242473602295, "learning_rate": 1.1358169216836019e-05, "loss": 1.6589, "step": 19360 }, { "epoch": 2.750496735736588, "grad_norm": 2.8770012855529785, "learning_rate": 1.1343408219210488e-05, "loss": 1.639, "step": 19380 }, { "epoch": 2.753335225659949, "grad_norm": 2.621298313140869, "learning_rate": 1.1328644240070771e-05, "loss": 1.6663, "step": 19400 }, { "epoch": 2.7561737155833095, "grad_norm": 2.764375925064087, "learning_rate": 1.1313877312183536e-05, "loss": 1.6822, "step": 19420 }, { "epoch": 2.7590122055066706, "grad_norm": 2.7251837253570557, "learning_rate": 1.1299107468321997e-05, "loss": 1.5972, "step": 19440 }, { "epoch": 2.7618506954300313, "grad_norm": 2.4582672119140625, "learning_rate": 1.1284334741265838e-05, "loss": 1.6618, "step": 19460 }, { "epoch": 2.764689185353392, "grad_norm": 2.6001780033111572, "learning_rate": 1.1269559163801146e-05, "loss": 1.6398, "step": 19480 }, { "epoch": 2.767527675276753, "grad_norm": 2.7598631381988525, "learning_rate": 1.1254780768720331e-05, "loss": 1.6943, "step": 19500 }, { "epoch": 2.7703661652001137, "grad_norm": 2.683062791824341, "learning_rate": 1.1239999588822056e-05, "loss": 1.6545, "step": 19520 }, { "epoch": 2.7732046551234744, "grad_norm": 2.9445276260375977, "learning_rate": 1.1225215656911166e-05, "loss": 1.6525, "step": 19540 }, { "epoch": 2.776043145046835, "grad_norm": 2.7421555519104004, "learning_rate": 1.1210429005798613e-05, "loss": 1.6858, "step": 19560 }, { "epoch": 2.7788816349701957, "grad_norm": 2.801661491394043, "learning_rate": 1.1195639668301384e-05, "loss": 1.6743, "step": 19580 }, { "epoch": 2.781720124893557, "grad_norm": 2.7225821018218994, "learning_rate": 1.1180847677242425e-05, "loss": 1.622, "step": 19600 }, { "epoch": 2.7845586148169175, "grad_norm": 2.662832021713257, "learning_rate": 1.116605306545058e-05, "loss": 1.6984, "step": 19620 }, { "epoch": 2.787397104740278, "grad_norm": 2.7986505031585693, "learning_rate": 1.11512558657605e-05, "loss": 1.6995, "step": 19640 }, { "epoch": 2.790235594663639, "grad_norm": 2.8205065727233887, "learning_rate": 1.1136456111012584e-05, "loss": 1.677, "step": 19660 }, { "epoch": 2.7930740845869995, "grad_norm": 2.7117040157318115, "learning_rate": 1.1121653834052901e-05, "loss": 1.6482, "step": 19680 }, { "epoch": 2.7959125745103606, "grad_norm": 2.7424070835113525, "learning_rate": 1.1106849067733114e-05, "loss": 1.6568, "step": 19700 }, { "epoch": 2.7987510644337212, "grad_norm": 2.671940326690674, "learning_rate": 1.1092041844910422e-05, "loss": 1.6693, "step": 19720 }, { "epoch": 2.801589554357082, "grad_norm": 2.7280619144439697, "learning_rate": 1.107723219844746e-05, "loss": 1.6366, "step": 19740 }, { "epoch": 2.804428044280443, "grad_norm": 2.6971585750579834, "learning_rate": 1.1062420161212258e-05, "loss": 1.6983, "step": 19760 }, { "epoch": 2.8072665342038037, "grad_norm": 2.627903461456299, "learning_rate": 1.1047605766078137e-05, "loss": 1.6553, "step": 19780 }, { "epoch": 2.8101050241271643, "grad_norm": 2.809253454208374, "learning_rate": 1.1032789045923664e-05, "loss": 1.6719, "step": 19800 }, { "epoch": 2.812943514050525, "grad_norm": 2.7533586025238037, "learning_rate": 1.1017970033632554e-05, "loss": 1.697, "step": 19820 }, { "epoch": 2.8157820039738857, "grad_norm": 2.755267381668091, "learning_rate": 1.1003148762093618e-05, "loss": 1.6891, "step": 19840 }, { "epoch": 2.818620493897247, "grad_norm": 2.563636064529419, "learning_rate": 1.0988325264200684e-05, "loss": 1.6811, "step": 19860 }, { "epoch": 2.8214589838206074, "grad_norm": 2.6062510013580322, "learning_rate": 1.0973499572852505e-05, "loss": 1.618, "step": 19880 }, { "epoch": 2.824297473743968, "grad_norm": 2.5996932983398438, "learning_rate": 1.0958671720952721e-05, "loss": 1.6733, "step": 19900 }, { "epoch": 2.827135963667329, "grad_norm": 2.854038715362549, "learning_rate": 1.0943841741409754e-05, "loss": 1.6593, "step": 19920 }, { "epoch": 2.82997445359069, "grad_norm": 2.6900007724761963, "learning_rate": 1.0929009667136755e-05, "loss": 1.6702, "step": 19940 }, { "epoch": 2.8328129435140506, "grad_norm": 2.6370670795440674, "learning_rate": 1.0914175531051522e-05, "loss": 1.6859, "step": 19960 }, { "epoch": 2.835651433437411, "grad_norm": 2.4354281425476074, "learning_rate": 1.089933936607643e-05, "loss": 1.6344, "step": 19980 }, { "epoch": 2.838489923360772, "grad_norm": 2.820439577102661, "learning_rate": 1.0884501205138344e-05, "loss": 1.6964, "step": 20000 }, { "epoch": 2.841328413284133, "grad_norm": 2.754190444946289, "learning_rate": 1.0869661081168586e-05, "loss": 1.6316, "step": 20020 }, { "epoch": 2.8441669032074937, "grad_norm": 2.837782382965088, "learning_rate": 1.0854819027102811e-05, "loss": 1.6503, "step": 20040 }, { "epoch": 2.8470053931308543, "grad_norm": 2.9513120651245117, "learning_rate": 1.0839975075880967e-05, "loss": 1.6917, "step": 20060 }, { "epoch": 2.8498438830542154, "grad_norm": 2.638043165206909, "learning_rate": 1.0825129260447205e-05, "loss": 1.6252, "step": 20080 }, { "epoch": 2.8526823729775757, "grad_norm": 2.746992349624634, "learning_rate": 1.081028161374983e-05, "loss": 1.6557, "step": 20100 }, { "epoch": 2.8555208629009368, "grad_norm": 2.6401164531707764, "learning_rate": 1.0795432168741193e-05, "loss": 1.6845, "step": 20120 }, { "epoch": 2.8583593528242974, "grad_norm": 2.816802501678467, "learning_rate": 1.0780580958377644e-05, "loss": 1.6437, "step": 20140 }, { "epoch": 2.861197842747658, "grad_norm": 2.933926582336426, "learning_rate": 1.0765728015619454e-05, "loss": 1.669, "step": 20160 }, { "epoch": 2.864036332671019, "grad_norm": 2.626936674118042, "learning_rate": 1.0750873373430735e-05, "loss": 1.6549, "step": 20180 }, { "epoch": 2.86687482259438, "grad_norm": 2.6689255237579346, "learning_rate": 1.0736017064779369e-05, "loss": 1.6502, "step": 20200 }, { "epoch": 2.8697133125177405, "grad_norm": 2.801917791366577, "learning_rate": 1.072115912263694e-05, "loss": 1.7137, "step": 20220 }, { "epoch": 2.872551802441101, "grad_norm": 2.873746871948242, "learning_rate": 1.070629957997865e-05, "loss": 1.6539, "step": 20240 }, { "epoch": 2.875390292364462, "grad_norm": 2.6104683876037598, "learning_rate": 1.0691438469783268e-05, "loss": 1.6902, "step": 20260 }, { "epoch": 2.878228782287823, "grad_norm": 2.7353999614715576, "learning_rate": 1.0676575825033029e-05, "loss": 1.6558, "step": 20280 }, { "epoch": 2.8810672722111836, "grad_norm": 2.764976739883423, "learning_rate": 1.0661711678713575e-05, "loss": 1.694, "step": 20300 }, { "epoch": 2.8839057621345443, "grad_norm": 2.732469081878662, "learning_rate": 1.0646846063813889e-05, "loss": 1.7176, "step": 20320 }, { "epoch": 2.8867442520579054, "grad_norm": 2.728438138961792, "learning_rate": 1.0631979013326205e-05, "loss": 1.6396, "step": 20340 }, { "epoch": 2.889582741981266, "grad_norm": 2.7057816982269287, "learning_rate": 1.0617110560245944e-05, "loss": 1.6721, "step": 20360 }, { "epoch": 2.8924212319046267, "grad_norm": 2.6167151927948, "learning_rate": 1.0602240737571645e-05, "loss": 1.6584, "step": 20380 }, { "epoch": 2.8952597218279874, "grad_norm": 2.7295291423797607, "learning_rate": 1.0587369578304879e-05, "loss": 1.6519, "step": 20400 }, { "epoch": 2.898098211751348, "grad_norm": 2.8099365234375, "learning_rate": 1.0572497115450188e-05, "loss": 1.6652, "step": 20420 }, { "epoch": 2.900936701674709, "grad_norm": 2.541081190109253, "learning_rate": 1.0557623382015012e-05, "loss": 1.651, "step": 20440 }, { "epoch": 2.90377519159807, "grad_norm": 2.899733066558838, "learning_rate": 1.0542748411009596e-05, "loss": 1.6626, "step": 20460 }, { "epoch": 2.9066136815214305, "grad_norm": 2.7668466567993164, "learning_rate": 1.052787223544695e-05, "loss": 1.662, "step": 20480 }, { "epoch": 2.9094521714447916, "grad_norm": 2.7802133560180664, "learning_rate": 1.0512994888342745e-05, "loss": 1.6912, "step": 20500 }, { "epoch": 2.9122906613681523, "grad_norm": 2.659390449523926, "learning_rate": 1.0498116402715254e-05, "loss": 1.69, "step": 20520 }, { "epoch": 2.915129151291513, "grad_norm": 2.740283250808716, "learning_rate": 1.048323681158528e-05, "loss": 1.6867, "step": 20540 }, { "epoch": 2.9179676412148736, "grad_norm": 2.7201805114746094, "learning_rate": 1.0468356147976079e-05, "loss": 1.6678, "step": 20560 }, { "epoch": 2.9208061311382343, "grad_norm": 2.652395725250244, "learning_rate": 1.0453474444913288e-05, "loss": 1.6633, "step": 20580 }, { "epoch": 2.9236446210615954, "grad_norm": 2.6902647018432617, "learning_rate": 1.0438591735424843e-05, "loss": 1.6821, "step": 20600 }, { "epoch": 2.926483110984956, "grad_norm": 2.688502788543701, "learning_rate": 1.0423708052540924e-05, "loss": 1.6104, "step": 20620 }, { "epoch": 2.9293216009083167, "grad_norm": 2.742502450942993, "learning_rate": 1.0408823429293866e-05, "loss": 1.6085, "step": 20640 }, { "epoch": 2.9321600908316774, "grad_norm": 2.8404297828674316, "learning_rate": 1.0393937898718091e-05, "loss": 1.6574, "step": 20660 }, { "epoch": 2.934998580755038, "grad_norm": 2.804955244064331, "learning_rate": 1.0379051493850039e-05, "loss": 1.6558, "step": 20680 }, { "epoch": 2.937837070678399, "grad_norm": 2.683284282684326, "learning_rate": 1.0364164247728082e-05, "loss": 1.6329, "step": 20700 }, { "epoch": 2.94067556060176, "grad_norm": 2.7313232421875, "learning_rate": 1.0349276193392467e-05, "loss": 1.6424, "step": 20720 }, { "epoch": 2.9435140505251205, "grad_norm": 2.8659298419952393, "learning_rate": 1.0334387363885232e-05, "loss": 1.6899, "step": 20740 }, { "epoch": 2.9463525404484816, "grad_norm": 2.645648717880249, "learning_rate": 1.0319497792250138e-05, "loss": 1.6336, "step": 20760 }, { "epoch": 2.9491910303718423, "grad_norm": 2.6986279487609863, "learning_rate": 1.030460751153258e-05, "loss": 1.6474, "step": 20780 }, { "epoch": 2.952029520295203, "grad_norm": 2.698859691619873, "learning_rate": 1.0289716554779551e-05, "loss": 1.6576, "step": 20800 }, { "epoch": 2.9548680102185636, "grad_norm": 2.71421480178833, "learning_rate": 1.0274824955039524e-05, "loss": 1.6429, "step": 20820 }, { "epoch": 2.9577065001419243, "grad_norm": 2.7509477138519287, "learning_rate": 1.0259932745362399e-05, "loss": 1.6339, "step": 20840 }, { "epoch": 2.9605449900652854, "grad_norm": 2.608607292175293, "learning_rate": 1.0245039958799447e-05, "loss": 1.6018, "step": 20860 }, { "epoch": 2.963383479988646, "grad_norm": 2.560154914855957, "learning_rate": 1.0230146628403201e-05, "loss": 1.6565, "step": 20880 }, { "epoch": 2.9662219699120067, "grad_norm": 2.5181398391723633, "learning_rate": 1.0215252787227414e-05, "loss": 1.6805, "step": 20900 }, { "epoch": 2.969060459835368, "grad_norm": 2.696845769882202, "learning_rate": 1.020035846832696e-05, "loss": 1.6631, "step": 20920 }, { "epoch": 2.9718989497587285, "grad_norm": 2.786024808883667, "learning_rate": 1.0185463704757785e-05, "loss": 1.6894, "step": 20940 }, { "epoch": 2.974737439682089, "grad_norm": 2.8508172035217285, "learning_rate": 1.0170568529576816e-05, "loss": 1.6771, "step": 20960 }, { "epoch": 2.97757592960545, "grad_norm": 2.8293559551239014, "learning_rate": 1.0155672975841896e-05, "loss": 1.7451, "step": 20980 }, { "epoch": 2.9804144195288105, "grad_norm": 2.6233816146850586, "learning_rate": 1.0140777076611706e-05, "loss": 1.6802, "step": 21000 }, { "epoch": 2.9832529094521716, "grad_norm": 2.6177377700805664, "learning_rate": 1.0125880864945694e-05, "loss": 1.6691, "step": 21020 }, { "epoch": 2.9860913993755323, "grad_norm": 2.616244316101074, "learning_rate": 1.0110984373904e-05, "loss": 1.6562, "step": 21040 }, { "epoch": 2.988929889298893, "grad_norm": 2.8227474689483643, "learning_rate": 1.0096087636547389e-05, "loss": 1.6829, "step": 21060 }, { "epoch": 2.991768379222254, "grad_norm": 2.9333550930023193, "learning_rate": 1.0081190685937168e-05, "loss": 1.6077, "step": 21080 }, { "epoch": 2.9946068691456147, "grad_norm": 2.5948550701141357, "learning_rate": 1.006629355513512e-05, "loss": 1.6416, "step": 21100 }, { "epoch": 2.9974453590689754, "grad_norm": 2.77489972114563, "learning_rate": 1.0051396277203427e-05, "loss": 1.688, "step": 21120 }, { "epoch": 3.000283848992336, "grad_norm": 2.8716676235198975, "learning_rate": 1.0036498885204598e-05, "loss": 1.6653, "step": 21140 }, { "epoch": 3.0031223389156967, "grad_norm": 3.2379279136657715, "learning_rate": 1.002160141220139e-05, "loss": 1.4773, "step": 21160 }, { "epoch": 3.005960828839058, "grad_norm": 3.159872531890869, "learning_rate": 1.0006703891256748e-05, "loss": 1.3809, "step": 21180 }, { "epoch": 3.0087993187624185, "grad_norm": 3.2384188175201416, "learning_rate": 9.991806355433722e-06, "loss": 1.4283, "step": 21200 }, { "epoch": 3.011637808685779, "grad_norm": 3.1655704975128174, "learning_rate": 9.976908837795388e-06, "loss": 1.4144, "step": 21220 }, { "epoch": 3.01447629860914, "grad_norm": 3.155158042907715, "learning_rate": 9.96201137140479e-06, "loss": 1.4388, "step": 21240 }, { "epoch": 3.017314788532501, "grad_norm": 3.256465435028076, "learning_rate": 9.947113989324852e-06, "loss": 1.3952, "step": 21260 }, { "epoch": 3.0201532784558616, "grad_norm": 3.3961892127990723, "learning_rate": 9.932216724618316e-06, "loss": 1.4026, "step": 21280 }, { "epoch": 3.0229917683792222, "grad_norm": 3.2471697330474854, "learning_rate": 9.917319610347657e-06, "loss": 1.4027, "step": 21300 }, { "epoch": 3.025830258302583, "grad_norm": 3.0608444213867188, "learning_rate": 9.902422679575027e-06, "loss": 1.4102, "step": 21320 }, { "epoch": 3.028668748225944, "grad_norm": 3.396030902862549, "learning_rate": 9.887525965362154e-06, "loss": 1.4263, "step": 21340 }, { "epoch": 3.0315072381493047, "grad_norm": 3.2044589519500732, "learning_rate": 9.872629500770304e-06, "loss": 1.4441, "step": 21360 }, { "epoch": 3.0343457280726653, "grad_norm": 3.1486377716064453, "learning_rate": 9.85773331886018e-06, "loss": 1.3919, "step": 21380 }, { "epoch": 3.037184217996026, "grad_norm": 3.218489646911621, "learning_rate": 9.84283745269185e-06, "loss": 1.4084, "step": 21400 }, { "epoch": 3.0400227079193867, "grad_norm": 3.371188163757324, "learning_rate": 9.827941935324699e-06, "loss": 1.4191, "step": 21420 }, { "epoch": 3.042861197842748, "grad_norm": 3.489412546157837, "learning_rate": 9.813046799817322e-06, "loss": 1.4476, "step": 21440 }, { "epoch": 3.0456996877661084, "grad_norm": 3.2109546661376953, "learning_rate": 9.798152079227475e-06, "loss": 1.4324, "step": 21460 }, { "epoch": 3.048538177689469, "grad_norm": 3.3935282230377197, "learning_rate": 9.78325780661199e-06, "loss": 1.42, "step": 21480 }, { "epoch": 3.0513766676128298, "grad_norm": 3.57053542137146, "learning_rate": 9.768364015026701e-06, "loss": 1.3939, "step": 21500 }, { "epoch": 3.054215157536191, "grad_norm": 3.337756395339966, "learning_rate": 9.753470737526385e-06, "loss": 1.3893, "step": 21520 }, { "epoch": 3.0570536474595515, "grad_norm": 3.060789108276367, "learning_rate": 9.738578007164672e-06, "loss": 1.3908, "step": 21540 }, { "epoch": 3.059892137382912, "grad_norm": 3.4710278511047363, "learning_rate": 9.723685856993974e-06, "loss": 1.4292, "step": 21560 }, { "epoch": 3.062730627306273, "grad_norm": 3.3397037982940674, "learning_rate": 9.70879432006542e-06, "loss": 1.4025, "step": 21580 }, { "epoch": 3.065569117229634, "grad_norm": 3.4931745529174805, "learning_rate": 9.693903429428781e-06, "loss": 1.4523, "step": 21600 }, { "epoch": 3.0684076071529947, "grad_norm": 3.404431104660034, "learning_rate": 9.679013218132382e-06, "loss": 1.3905, "step": 21620 }, { "epoch": 3.0712460970763553, "grad_norm": 3.200284004211426, "learning_rate": 9.664123719223059e-06, "loss": 1.4365, "step": 21640 }, { "epoch": 3.074084586999716, "grad_norm": 3.4462153911590576, "learning_rate": 9.649234965746045e-06, "loss": 1.4448, "step": 21660 }, { "epoch": 3.076923076923077, "grad_norm": 3.3211820125579834, "learning_rate": 9.634346990744932e-06, "loss": 1.4551, "step": 21680 }, { "epoch": 3.0797615668464378, "grad_norm": 3.090240001678467, "learning_rate": 9.619459827261588e-06, "loss": 1.3528, "step": 21700 }, { "epoch": 3.0826000567697984, "grad_norm": 3.3265163898468018, "learning_rate": 9.604573508336065e-06, "loss": 1.4615, "step": 21720 }, { "epoch": 3.085438546693159, "grad_norm": 3.297335386276245, "learning_rate": 9.589688067006553e-06, "loss": 1.4082, "step": 21740 }, { "epoch": 3.08827703661652, "grad_norm": 3.3396027088165283, "learning_rate": 9.574803536309297e-06, "loss": 1.4191, "step": 21760 }, { "epoch": 3.091115526539881, "grad_norm": 3.3462960720062256, "learning_rate": 9.559919949278503e-06, "loss": 1.4275, "step": 21780 }, { "epoch": 3.0939540164632415, "grad_norm": 3.384568452835083, "learning_rate": 9.545037338946304e-06, "loss": 1.4169, "step": 21800 }, { "epoch": 3.096792506386602, "grad_norm": 3.330578327178955, "learning_rate": 9.530155738342648e-06, "loss": 1.3901, "step": 21820 }, { "epoch": 3.0996309963099633, "grad_norm": 3.1881208419799805, "learning_rate": 9.515275180495255e-06, "loss": 1.4109, "step": 21840 }, { "epoch": 3.102469486233324, "grad_norm": 3.18969464302063, "learning_rate": 9.500395698429525e-06, "loss": 1.4161, "step": 21860 }, { "epoch": 3.1053079761566846, "grad_norm": 3.2706992626190186, "learning_rate": 9.48551732516847e-06, "loss": 1.388, "step": 21880 }, { "epoch": 3.1081464660800453, "grad_norm": 3.3498735427856445, "learning_rate": 9.470640093732639e-06, "loss": 1.4706, "step": 21900 }, { "epoch": 3.1109849560034064, "grad_norm": 3.349871873855591, "learning_rate": 9.455764037140055e-06, "loss": 1.3925, "step": 21920 }, { "epoch": 3.113823445926767, "grad_norm": 2.9981496334075928, "learning_rate": 9.440889188406124e-06, "loss": 1.3733, "step": 21940 }, { "epoch": 3.1166619358501277, "grad_norm": 3.2593114376068115, "learning_rate": 9.426015580543577e-06, "loss": 1.3959, "step": 21960 }, { "epoch": 3.1195004257734884, "grad_norm": 3.307849645614624, "learning_rate": 9.411143246562393e-06, "loss": 1.3883, "step": 21980 }, { "epoch": 3.122338915696849, "grad_norm": 3.4641823768615723, "learning_rate": 9.396272219469716e-06, "loss": 1.4056, "step": 22000 }, { "epoch": 3.12517740562021, "grad_norm": 3.2760772705078125, "learning_rate": 9.3814025322698e-06, "loss": 1.4296, "step": 22020 }, { "epoch": 3.128015895543571, "grad_norm": 3.274237632751465, "learning_rate": 9.366534217963912e-06, "loss": 1.4281, "step": 22040 }, { "epoch": 3.1308543854669315, "grad_norm": 3.4149110317230225, "learning_rate": 9.351667309550285e-06, "loss": 1.3817, "step": 22060 }, { "epoch": 3.133692875390292, "grad_norm": 3.2452287673950195, "learning_rate": 9.336801840024025e-06, "loss": 1.4125, "step": 22080 }, { "epoch": 3.1365313653136533, "grad_norm": 3.2653396129608154, "learning_rate": 9.321937842377047e-06, "loss": 1.3909, "step": 22100 }, { "epoch": 3.139369855237014, "grad_norm": 3.0482804775238037, "learning_rate": 9.307075349597993e-06, "loss": 1.4049, "step": 22120 }, { "epoch": 3.1422083451603746, "grad_norm": 3.294626474380493, "learning_rate": 9.292214394672178e-06, "loss": 1.4056, "step": 22140 }, { "epoch": 3.1450468350837353, "grad_norm": 3.4682154655456543, "learning_rate": 9.27735501058149e-06, "loss": 1.3705, "step": 22160 }, { "epoch": 3.1478853250070964, "grad_norm": 3.3536932468414307, "learning_rate": 9.262497230304343e-06, "loss": 1.4289, "step": 22180 }, { "epoch": 3.150723814930457, "grad_norm": 3.439310073852539, "learning_rate": 9.247641086815578e-06, "loss": 1.3797, "step": 22200 }, { "epoch": 3.1535623048538177, "grad_norm": 3.1599812507629395, "learning_rate": 9.232786613086416e-06, "loss": 1.4265, "step": 22220 }, { "epoch": 3.1564007947771784, "grad_norm": 3.2216975688934326, "learning_rate": 9.21793384208437e-06, "loss": 1.4176, "step": 22240 }, { "epoch": 3.1592392847005395, "grad_norm": 2.9417402744293213, "learning_rate": 9.203082806773164e-06, "loss": 1.4016, "step": 22260 }, { "epoch": 3.1620777746239, "grad_norm": 3.1937625408172607, "learning_rate": 9.188233540112683e-06, "loss": 1.3904, "step": 22280 }, { "epoch": 3.164916264547261, "grad_norm": 3.346270799636841, "learning_rate": 9.173386075058879e-06, "loss": 1.4261, "step": 22300 }, { "epoch": 3.1677547544706215, "grad_norm": 3.351668119430542, "learning_rate": 9.158540444563706e-06, "loss": 1.4742, "step": 22320 }, { "epoch": 3.1705932443939826, "grad_norm": 3.3080108165740967, "learning_rate": 9.143696681575052e-06, "loss": 1.4322, "step": 22340 }, { "epoch": 3.1734317343173433, "grad_norm": 3.11556077003479, "learning_rate": 9.128854819036647e-06, "loss": 1.3613, "step": 22360 }, { "epoch": 3.176270224240704, "grad_norm": 3.347236394882202, "learning_rate": 9.114014889888021e-06, "loss": 1.4332, "step": 22380 }, { "epoch": 3.1791087141640646, "grad_norm": 3.163140296936035, "learning_rate": 9.099176927064407e-06, "loss": 1.3874, "step": 22400 }, { "epoch": 3.1819472040874253, "grad_norm": 3.4615697860717773, "learning_rate": 9.084340963496659e-06, "loss": 1.4453, "step": 22420 }, { "epoch": 3.1847856940107864, "grad_norm": 3.333786964416504, "learning_rate": 9.069507032111215e-06, "loss": 1.4407, "step": 22440 }, { "epoch": 3.187624183934147, "grad_norm": 3.491029977798462, "learning_rate": 9.054675165829998e-06, "loss": 1.4683, "step": 22460 }, { "epoch": 3.1904626738575077, "grad_norm": 3.4206972122192383, "learning_rate": 9.039845397570332e-06, "loss": 1.4319, "step": 22480 }, { "epoch": 3.193301163780869, "grad_norm": 3.2368667125701904, "learning_rate": 9.025017760244909e-06, "loss": 1.4092, "step": 22500 }, { "epoch": 3.1961396537042295, "grad_norm": 3.2499349117279053, "learning_rate": 9.010192286761675e-06, "loss": 1.4281, "step": 22520 }, { "epoch": 3.19897814362759, "grad_norm": 3.4435946941375732, "learning_rate": 8.995369010023771e-06, "loss": 1.4464, "step": 22540 }, { "epoch": 3.201816633550951, "grad_norm": 3.367335557937622, "learning_rate": 8.980547962929482e-06, "loss": 1.4133, "step": 22560 }, { "epoch": 3.2046551234743115, "grad_norm": 3.3545525074005127, "learning_rate": 8.965729178372122e-06, "loss": 1.459, "step": 22580 }, { "epoch": 3.2074936133976726, "grad_norm": 3.382192611694336, "learning_rate": 8.950912689239998e-06, "loss": 1.4206, "step": 22600 }, { "epoch": 3.2103321033210332, "grad_norm": 3.4491827487945557, "learning_rate": 8.936098528416322e-06, "loss": 1.4749, "step": 22620 }, { "epoch": 3.213170593244394, "grad_norm": 3.2631564140319824, "learning_rate": 8.921286728779129e-06, "loss": 1.4104, "step": 22640 }, { "epoch": 3.2160090831677546, "grad_norm": 3.622498035430908, "learning_rate": 8.90647732320122e-06, "loss": 1.4005, "step": 22660 }, { "epoch": 3.2188475730911157, "grad_norm": 3.5100960731506348, "learning_rate": 8.89167034455009e-06, "loss": 1.4285, "step": 22680 }, { "epoch": 3.2216860630144764, "grad_norm": 3.2971818447113037, "learning_rate": 8.876865825687834e-06, "loss": 1.4244, "step": 22700 }, { "epoch": 3.224524552937837, "grad_norm": 3.5792524814605713, "learning_rate": 8.862063799471095e-06, "loss": 1.4257, "step": 22720 }, { "epoch": 3.2273630428611977, "grad_norm": 3.335270404815674, "learning_rate": 8.847264298750983e-06, "loss": 1.4225, "step": 22740 }, { "epoch": 3.230201532784559, "grad_norm": 3.5463216304779053, "learning_rate": 8.832467356373e-06, "loss": 1.4467, "step": 22760 }, { "epoch": 3.2330400227079195, "grad_norm": 3.320298194885254, "learning_rate": 8.817673005176982e-06, "loss": 1.388, "step": 22780 }, { "epoch": 3.23587851263128, "grad_norm": 2.987459897994995, "learning_rate": 8.802881277996996e-06, "loss": 1.3844, "step": 22800 }, { "epoch": 3.238717002554641, "grad_norm": 3.440152406692505, "learning_rate": 8.788092207661296e-06, "loss": 1.4118, "step": 22820 }, { "epoch": 3.241555492478002, "grad_norm": 3.3504679203033447, "learning_rate": 8.773305826992246e-06, "loss": 1.4047, "step": 22840 }, { "epoch": 3.2443939824013626, "grad_norm": 3.548917770385742, "learning_rate": 8.758522168806222e-06, "loss": 1.4192, "step": 22860 }, { "epoch": 3.2472324723247232, "grad_norm": 3.5502102375030518, "learning_rate": 8.743741265913579e-06, "loss": 1.4656, "step": 22880 }, { "epoch": 3.250070962248084, "grad_norm": 3.322138547897339, "learning_rate": 8.728963151118538e-06, "loss": 1.4287, "step": 22900 }, { "epoch": 3.252909452171445, "grad_norm": 3.433283567428589, "learning_rate": 8.714187857219143e-06, "loss": 1.4207, "step": 22920 }, { "epoch": 3.2557479420948057, "grad_norm": 3.5198066234588623, "learning_rate": 8.699415417007179e-06, "loss": 1.4234, "step": 22940 }, { "epoch": 3.2585864320181663, "grad_norm": 3.4414491653442383, "learning_rate": 8.68464586326809e-06, "loss": 1.4021, "step": 22960 }, { "epoch": 3.261424921941527, "grad_norm": 3.4489943981170654, "learning_rate": 8.669879228780918e-06, "loss": 1.4072, "step": 22980 }, { "epoch": 3.2642634118648877, "grad_norm": 3.3313164710998535, "learning_rate": 8.655115546318232e-06, "loss": 1.4498, "step": 23000 }, { "epoch": 3.2671019017882488, "grad_norm": 3.3813352584838867, "learning_rate": 8.640354848646032e-06, "loss": 1.4371, "step": 23020 }, { "epoch": 3.2699403917116094, "grad_norm": 3.29073166847229, "learning_rate": 8.625597168523711e-06, "loss": 1.3786, "step": 23040 }, { "epoch": 3.27277888163497, "grad_norm": 3.370516538619995, "learning_rate": 8.610842538703963e-06, "loss": 1.4334, "step": 23060 }, { "epoch": 3.275617371558331, "grad_norm": 3.400479793548584, "learning_rate": 8.5960909919327e-06, "loss": 1.4409, "step": 23080 }, { "epoch": 3.278455861481692, "grad_norm": 3.5510811805725098, "learning_rate": 8.581342560949006e-06, "loss": 1.4219, "step": 23100 }, { "epoch": 3.2812943514050525, "grad_norm": 3.412184476852417, "learning_rate": 8.566597278485036e-06, "loss": 1.4379, "step": 23120 }, { "epoch": 3.284132841328413, "grad_norm": 3.368407726287842, "learning_rate": 8.551855177265975e-06, "loss": 1.442, "step": 23140 }, { "epoch": 3.286971331251774, "grad_norm": 3.4107041358947754, "learning_rate": 8.53711629000993e-06, "loss": 1.4545, "step": 23160 }, { "epoch": 3.289809821175135, "grad_norm": 3.6424269676208496, "learning_rate": 8.522380649427884e-06, "loss": 1.4254, "step": 23180 }, { "epoch": 3.2926483110984957, "grad_norm": 3.4802629947662354, "learning_rate": 8.507648288223612e-06, "loss": 1.3974, "step": 23200 }, { "epoch": 3.2954868010218563, "grad_norm": 3.3685178756713867, "learning_rate": 8.49291923909362e-06, "loss": 1.4114, "step": 23220 }, { "epoch": 3.298325290945217, "grad_norm": 3.34666109085083, "learning_rate": 8.478193534727041e-06, "loss": 1.4106, "step": 23240 }, { "epoch": 3.301163780868578, "grad_norm": 3.324201822280884, "learning_rate": 8.463471207805614e-06, "loss": 1.449, "step": 23260 }, { "epoch": 3.3040022707919388, "grad_norm": 3.198169469833374, "learning_rate": 8.448752291003553e-06, "loss": 1.3964, "step": 23280 }, { "epoch": 3.3068407607152994, "grad_norm": 3.3567185401916504, "learning_rate": 8.434036816987526e-06, "loss": 1.4191, "step": 23300 }, { "epoch": 3.30967925063866, "grad_norm": 3.55808687210083, "learning_rate": 8.419324818416554e-06, "loss": 1.418, "step": 23320 }, { "epoch": 3.312517740562021, "grad_norm": 3.2458930015563965, "learning_rate": 8.404616327941934e-06, "loss": 1.4639, "step": 23340 }, { "epoch": 3.315356230485382, "grad_norm": 3.5113108158111572, "learning_rate": 8.389911378207195e-06, "loss": 1.442, "step": 23360 }, { "epoch": 3.3181947204087425, "grad_norm": 3.453336238861084, "learning_rate": 8.375210001847997e-06, "loss": 1.4115, "step": 23380 }, { "epoch": 3.321033210332103, "grad_norm": 3.6144471168518066, "learning_rate": 8.360512231492063e-06, "loss": 1.4052, "step": 23400 }, { "epoch": 3.323871700255464, "grad_norm": 3.2711427211761475, "learning_rate": 8.345818099759132e-06, "loss": 1.4032, "step": 23420 }, { "epoch": 3.326710190178825, "grad_norm": 3.2941811084747314, "learning_rate": 8.33112763926085e-06, "loss": 1.4101, "step": 23440 }, { "epoch": 3.3295486801021856, "grad_norm": 3.2961928844451904, "learning_rate": 8.31644088260072e-06, "loss": 1.4191, "step": 23460 }, { "epoch": 3.3323871700255463, "grad_norm": 3.362497329711914, "learning_rate": 8.301757862374038e-06, "loss": 1.4855, "step": 23480 }, { "epoch": 3.3352256599489074, "grad_norm": 3.3870720863342285, "learning_rate": 8.287078611167782e-06, "loss": 1.4224, "step": 23500 }, { "epoch": 3.338064149872268, "grad_norm": 3.3629488945007324, "learning_rate": 8.272403161560586e-06, "loss": 1.4035, "step": 23520 }, { "epoch": 3.3409026397956287, "grad_norm": 3.5052473545074463, "learning_rate": 8.257731546122647e-06, "loss": 1.4368, "step": 23540 }, { "epoch": 3.3437411297189894, "grad_norm": 3.4093003273010254, "learning_rate": 8.243063797415635e-06, "loss": 1.4024, "step": 23560 }, { "epoch": 3.34657961964235, "grad_norm": 3.056823492050171, "learning_rate": 8.22839994799265e-06, "loss": 1.4096, "step": 23580 }, { "epoch": 3.349418109565711, "grad_norm": 3.461947441101074, "learning_rate": 8.213740030398146e-06, "loss": 1.4514, "step": 23600 }, { "epoch": 3.352256599489072, "grad_norm": 3.539713144302368, "learning_rate": 8.199084077167829e-06, "loss": 1.4116, "step": 23620 }, { "epoch": 3.3550950894124325, "grad_norm": 3.372297525405884, "learning_rate": 8.184432120828633e-06, "loss": 1.4567, "step": 23640 }, { "epoch": 3.357933579335793, "grad_norm": 3.401728391647339, "learning_rate": 8.169784193898598e-06, "loss": 1.4464, "step": 23660 }, { "epoch": 3.3607720692591543, "grad_norm": 3.4011144638061523, "learning_rate": 8.155140328886834e-06, "loss": 1.4426, "step": 23680 }, { "epoch": 3.363610559182515, "grad_norm": 3.7290472984313965, "learning_rate": 8.14050055829344e-06, "loss": 1.3961, "step": 23700 }, { "epoch": 3.3664490491058756, "grad_norm": 3.317993402481079, "learning_rate": 8.12586491460941e-06, "loss": 1.4263, "step": 23720 }, { "epoch": 3.3692875390292363, "grad_norm": 3.387925624847412, "learning_rate": 8.1112334303166e-06, "loss": 1.4108, "step": 23740 }, { "epoch": 3.3721260289525974, "grad_norm": 3.4022109508514404, "learning_rate": 8.09660613788762e-06, "loss": 1.4485, "step": 23760 }, { "epoch": 3.374964518875958, "grad_norm": 3.285409688949585, "learning_rate": 8.081983069785784e-06, "loss": 1.4326, "step": 23780 }, { "epoch": 3.3778030087993187, "grad_norm": 3.239304542541504, "learning_rate": 8.067364258465028e-06, "loss": 1.4021, "step": 23800 }, { "epoch": 3.3806414987226794, "grad_norm": 3.1347999572753906, "learning_rate": 8.052749736369838e-06, "loss": 1.4051, "step": 23820 }, { "epoch": 3.3834799886460405, "grad_norm": 3.4634714126586914, "learning_rate": 8.038139535935186e-06, "loss": 1.4328, "step": 23840 }, { "epoch": 3.386318478569401, "grad_norm": 3.5181312561035156, "learning_rate": 8.023533689586455e-06, "loss": 1.4285, "step": 23860 }, { "epoch": 3.389156968492762, "grad_norm": 3.5183846950531006, "learning_rate": 8.008932229739352e-06, "loss": 1.4352, "step": 23880 }, { "epoch": 3.3919954584161225, "grad_norm": 3.1833486557006836, "learning_rate": 7.994335188799863e-06, "loss": 1.4354, "step": 23900 }, { "epoch": 3.3948339483394836, "grad_norm": 3.5220303535461426, "learning_rate": 7.97974259916416e-06, "loss": 1.4262, "step": 23920 }, { "epoch": 3.3976724382628443, "grad_norm": 3.452599287033081, "learning_rate": 7.96515449321853e-06, "loss": 1.4257, "step": 23940 }, { "epoch": 3.400510928186205, "grad_norm": 3.441765069961548, "learning_rate": 7.950570903339325e-06, "loss": 1.397, "step": 23960 }, { "epoch": 3.4033494181095656, "grad_norm": 3.3197898864746094, "learning_rate": 7.93599186189286e-06, "loss": 1.4442, "step": 23980 }, { "epoch": 3.4061879080329263, "grad_norm": 3.269554615020752, "learning_rate": 7.921417401235362e-06, "loss": 1.4483, "step": 24000 }, { "epoch": 3.4090263979562874, "grad_norm": 3.188277006149292, "learning_rate": 7.906847553712887e-06, "loss": 1.4569, "step": 24020 }, { "epoch": 3.411864887879648, "grad_norm": 3.303219795227051, "learning_rate": 7.892282351661259e-06, "loss": 1.4418, "step": 24040 }, { "epoch": 3.4147033778030087, "grad_norm": 3.2643752098083496, "learning_rate": 7.877721827405983e-06, "loss": 1.4562, "step": 24060 }, { "epoch": 3.41754186772637, "grad_norm": 3.2512366771698, "learning_rate": 7.863166013262196e-06, "loss": 1.4139, "step": 24080 }, { "epoch": 3.4203803576497305, "grad_norm": 3.594169855117798, "learning_rate": 7.848614941534566e-06, "loss": 1.4443, "step": 24100 }, { "epoch": 3.423218847573091, "grad_norm": 3.525327205657959, "learning_rate": 7.834068644517245e-06, "loss": 1.4231, "step": 24120 }, { "epoch": 3.426057337496452, "grad_norm": 3.4014298915863037, "learning_rate": 7.819527154493794e-06, "loss": 1.4138, "step": 24140 }, { "epoch": 3.4288958274198125, "grad_norm": 3.501582384109497, "learning_rate": 7.804990503737084e-06, "loss": 1.4166, "step": 24160 }, { "epoch": 3.4317343173431736, "grad_norm": 3.5644009113311768, "learning_rate": 7.790458724509273e-06, "loss": 1.4323, "step": 24180 }, { "epoch": 3.4345728072665342, "grad_norm": 3.568424701690674, "learning_rate": 7.775931849061684e-06, "loss": 1.3956, "step": 24200 }, { "epoch": 3.437411297189895, "grad_norm": 3.2916793823242188, "learning_rate": 7.761409909634773e-06, "loss": 1.4163, "step": 24220 }, { "epoch": 3.4402497871132556, "grad_norm": 3.291956663131714, "learning_rate": 7.746892938458036e-06, "loss": 1.4327, "step": 24240 }, { "epoch": 3.4430882770366167, "grad_norm": 3.3634932041168213, "learning_rate": 7.732380967749932e-06, "loss": 1.4411, "step": 24260 }, { "epoch": 3.4459267669599773, "grad_norm": 3.469589948654175, "learning_rate": 7.717874029717841e-06, "loss": 1.4539, "step": 24280 }, { "epoch": 3.448765256883338, "grad_norm": 3.1999857425689697, "learning_rate": 7.703372156557965e-06, "loss": 1.4238, "step": 24300 }, { "epoch": 3.4516037468066987, "grad_norm": 3.1920242309570312, "learning_rate": 7.688875380455257e-06, "loss": 1.3745, "step": 24320 }, { "epoch": 3.45444223673006, "grad_norm": 3.3747425079345703, "learning_rate": 7.674383733583374e-06, "loss": 1.4239, "step": 24340 }, { "epoch": 3.4572807266534205, "grad_norm": 3.549278736114502, "learning_rate": 7.659897248104572e-06, "loss": 1.4053, "step": 24360 }, { "epoch": 3.460119216576781, "grad_norm": 3.546895742416382, "learning_rate": 7.645415956169665e-06, "loss": 1.4512, "step": 24380 }, { "epoch": 3.462957706500142, "grad_norm": 3.4185640811920166, "learning_rate": 7.630939889917941e-06, "loss": 1.4309, "step": 24400 }, { "epoch": 3.4657961964235025, "grad_norm": 3.504279375076294, "learning_rate": 7.616469081477079e-06, "loss": 1.4115, "step": 24420 }, { "epoch": 3.4686346863468636, "grad_norm": 3.483107089996338, "learning_rate": 7.602003562963095e-06, "loss": 1.4422, "step": 24440 }, { "epoch": 3.4714731762702242, "grad_norm": 3.1758716106414795, "learning_rate": 7.5875433664802725e-06, "loss": 1.4112, "step": 24460 }, { "epoch": 3.474311666193585, "grad_norm": 3.4936881065368652, "learning_rate": 7.573088524121067e-06, "loss": 1.4389, "step": 24480 }, { "epoch": 3.477150156116946, "grad_norm": 3.5021684169769287, "learning_rate": 7.558639067966066e-06, "loss": 1.4276, "step": 24500 }, { "epoch": 3.4799886460403067, "grad_norm": 3.3315179347991943, "learning_rate": 7.544195030083901e-06, "loss": 1.4293, "step": 24520 }, { "epoch": 3.4828271359636673, "grad_norm": 3.4236621856689453, "learning_rate": 7.529756442531164e-06, "loss": 1.4066, "step": 24540 }, { "epoch": 3.485665625887028, "grad_norm": 3.5500783920288086, "learning_rate": 7.515323337352376e-06, "loss": 1.4299, "step": 24560 }, { "epoch": 3.4885041158103887, "grad_norm": 3.6396684646606445, "learning_rate": 7.500895746579863e-06, "loss": 1.4463, "step": 24580 }, { "epoch": 3.4913426057337498, "grad_norm": 3.4823622703552246, "learning_rate": 7.486473702233733e-06, "loss": 1.4628, "step": 24600 }, { "epoch": 3.4941810956571104, "grad_norm": 3.4900810718536377, "learning_rate": 7.472057236321777e-06, "loss": 1.4442, "step": 24620 }, { "epoch": 3.497019585580471, "grad_norm": 3.5757498741149902, "learning_rate": 7.4576463808394025e-06, "loss": 1.3953, "step": 24640 }, { "epoch": 3.499858075503832, "grad_norm": 3.57453989982605, "learning_rate": 7.44324116776957e-06, "loss": 1.4206, "step": 24660 }, { "epoch": 3.502696565427193, "grad_norm": 3.3557260036468506, "learning_rate": 7.42884162908272e-06, "loss": 1.4638, "step": 24680 }, { "epoch": 3.5055350553505535, "grad_norm": 3.568833351135254, "learning_rate": 7.414447796736688e-06, "loss": 1.4891, "step": 24700 }, { "epoch": 3.508373545273914, "grad_norm": 3.3239681720733643, "learning_rate": 7.400059702676661e-06, "loss": 1.4199, "step": 24720 }, { "epoch": 3.511212035197275, "grad_norm": 3.4687602519989014, "learning_rate": 7.385677378835072e-06, "loss": 1.419, "step": 24740 }, { "epoch": 3.514050525120636, "grad_norm": 3.4170444011688232, "learning_rate": 7.371300857131564e-06, "loss": 1.4173, "step": 24760 }, { "epoch": 3.5168890150439966, "grad_norm": 3.4044437408447266, "learning_rate": 7.3569301694729e-06, "loss": 1.4438, "step": 24780 }, { "epoch": 3.5197275049673573, "grad_norm": 3.421126365661621, "learning_rate": 7.342565347752885e-06, "loss": 1.4566, "step": 24800 }, { "epoch": 3.522565994890718, "grad_norm": 3.6022894382476807, "learning_rate": 7.328206423852317e-06, "loss": 1.3959, "step": 24820 }, { "epoch": 3.5254044848140786, "grad_norm": 3.1480531692504883, "learning_rate": 7.313853429638898e-06, "loss": 1.4076, "step": 24840 }, { "epoch": 3.5282429747374398, "grad_norm": 3.455979347229004, "learning_rate": 7.299506396967173e-06, "loss": 1.401, "step": 24860 }, { "epoch": 3.5310814646608004, "grad_norm": 3.650588274002075, "learning_rate": 7.285165357678456e-06, "loss": 1.4375, "step": 24880 }, { "epoch": 3.533919954584161, "grad_norm": 3.5019893646240234, "learning_rate": 7.270830343600752e-06, "loss": 1.4403, "step": 24900 }, { "epoch": 3.536758444507522, "grad_norm": 3.6494686603546143, "learning_rate": 7.256501386548704e-06, "loss": 1.4617, "step": 24920 }, { "epoch": 3.539596934430883, "grad_norm": 3.340595006942749, "learning_rate": 7.242178518323515e-06, "loss": 1.4248, "step": 24940 }, { "epoch": 3.5424354243542435, "grad_norm": 3.3635270595550537, "learning_rate": 7.227861770712859e-06, "loss": 1.4142, "step": 24960 }, { "epoch": 3.545273914277604, "grad_norm": 3.3856077194213867, "learning_rate": 7.213551175490836e-06, "loss": 1.3947, "step": 24980 }, { "epoch": 3.548112404200965, "grad_norm": 3.0921993255615234, "learning_rate": 7.199246764417898e-06, "loss": 1.4074, "step": 25000 }, { "epoch": 3.550950894124326, "grad_norm": 3.50829815864563, "learning_rate": 7.184948569240757e-06, "loss": 1.4113, "step": 25020 }, { "epoch": 3.5537893840476866, "grad_norm": 3.2148396968841553, "learning_rate": 7.170656621692344e-06, "loss": 1.4349, "step": 25040 }, { "epoch": 3.5566278739710473, "grad_norm": 3.254676103591919, "learning_rate": 7.156370953491716e-06, "loss": 1.4644, "step": 25060 }, { "epoch": 3.5594663638944084, "grad_norm": 3.254268169403076, "learning_rate": 7.142091596343992e-06, "loss": 1.3922, "step": 25080 }, { "epoch": 3.562304853817769, "grad_norm": 3.3635122776031494, "learning_rate": 7.127818581940293e-06, "loss": 1.4579, "step": 25100 }, { "epoch": 3.5651433437411297, "grad_norm": 3.367809295654297, "learning_rate": 7.113551941957653e-06, "loss": 1.4482, "step": 25120 }, { "epoch": 3.5679818336644904, "grad_norm": 3.5001018047332764, "learning_rate": 7.099291708058966e-06, "loss": 1.4133, "step": 25140 }, { "epoch": 3.570820323587851, "grad_norm": 3.3876442909240723, "learning_rate": 7.085037911892913e-06, "loss": 1.4105, "step": 25160 }, { "epoch": 3.573658813511212, "grad_norm": 3.4363443851470947, "learning_rate": 7.070790585093873e-06, "loss": 1.412, "step": 25180 }, { "epoch": 3.576497303434573, "grad_norm": 3.470229387283325, "learning_rate": 7.056549759281876e-06, "loss": 1.4073, "step": 25200 }, { "epoch": 3.5793357933579335, "grad_norm": 3.189147710800171, "learning_rate": 7.042315466062532e-06, "loss": 1.424, "step": 25220 }, { "epoch": 3.5821742832812946, "grad_norm": 3.729313373565674, "learning_rate": 7.028087737026929e-06, "loss": 1.463, "step": 25240 }, { "epoch": 3.5850127732046553, "grad_norm": 3.2495806217193604, "learning_rate": 7.013866603751616e-06, "loss": 1.4363, "step": 25260 }, { "epoch": 3.587851263128016, "grad_norm": 3.4398090839385986, "learning_rate": 6.9996520977984804e-06, "loss": 1.3972, "step": 25280 }, { "epoch": 3.5906897530513766, "grad_norm": 3.1955244541168213, "learning_rate": 6.985444250714711e-06, "loss": 1.4532, "step": 25300 }, { "epoch": 3.5935282429747373, "grad_norm": 3.364966630935669, "learning_rate": 6.971243094032719e-06, "loss": 1.4573, "step": 25320 }, { "epoch": 3.5963667328980984, "grad_norm": 3.6411001682281494, "learning_rate": 6.957048659270061e-06, "loss": 1.4226, "step": 25340 }, { "epoch": 3.599205222821459, "grad_norm": 3.258669137954712, "learning_rate": 6.942860977929382e-06, "loss": 1.3919, "step": 25360 }, { "epoch": 3.6020437127448197, "grad_norm": 3.462082624435425, "learning_rate": 6.928680081498342e-06, "loss": 1.4236, "step": 25380 }, { "epoch": 3.6048822026681804, "grad_norm": 3.5179169178009033, "learning_rate": 6.914506001449526e-06, "loss": 1.3964, "step": 25400 }, { "epoch": 3.607720692591541, "grad_norm": 3.3972086906433105, "learning_rate": 6.900338769240411e-06, "loss": 1.4646, "step": 25420 }, { "epoch": 3.610559182514902, "grad_norm": 3.6233134269714355, "learning_rate": 6.88617841631326e-06, "loss": 1.4451, "step": 25440 }, { "epoch": 3.613397672438263, "grad_norm": 3.456287384033203, "learning_rate": 6.872024974095079e-06, "loss": 1.4734, "step": 25460 }, { "epoch": 3.6162361623616235, "grad_norm": 3.5421485900878906, "learning_rate": 6.857878473997532e-06, "loss": 1.4421, "step": 25480 }, { "epoch": 3.6190746522849846, "grad_norm": 3.1443164348602295, "learning_rate": 6.843738947416877e-06, "loss": 1.4129, "step": 25500 }, { "epoch": 3.6219131422083453, "grad_norm": 3.311699867248535, "learning_rate": 6.829606425733892e-06, "loss": 1.4386, "step": 25520 }, { "epoch": 3.624751632131706, "grad_norm": 3.3542139530181885, "learning_rate": 6.81548094031382e-06, "loss": 1.4516, "step": 25540 }, { "epoch": 3.6275901220550666, "grad_norm": 3.480483293533325, "learning_rate": 6.80136252250627e-06, "loss": 1.4577, "step": 25560 }, { "epoch": 3.6304286119784273, "grad_norm": 3.463317632675171, "learning_rate": 6.787251203645178e-06, "loss": 1.4186, "step": 25580 }, { "epoch": 3.6332671019017884, "grad_norm": 3.725294589996338, "learning_rate": 6.773147015048725e-06, "loss": 1.4485, "step": 25600 }, { "epoch": 3.636105591825149, "grad_norm": 3.4837920665740967, "learning_rate": 6.7590499880192595e-06, "loss": 1.4132, "step": 25620 }, { "epoch": 3.6389440817485097, "grad_norm": 3.1828911304473877, "learning_rate": 6.7449601538432454e-06, "loss": 1.3954, "step": 25640 }, { "epoch": 3.641782571671871, "grad_norm": 3.573230028152466, "learning_rate": 6.7308775437911726e-06, "loss": 1.4488, "step": 25660 }, { "epoch": 3.6446210615952315, "grad_norm": 3.3434858322143555, "learning_rate": 6.716802189117506e-06, "loss": 1.4201, "step": 25680 }, { "epoch": 3.647459551518592, "grad_norm": 3.1655311584472656, "learning_rate": 6.7027341210606075e-06, "loss": 1.4638, "step": 25700 }, { "epoch": 3.650298041441953, "grad_norm": 3.459294080734253, "learning_rate": 6.6886733708426646e-06, "loss": 1.4215, "step": 25720 }, { "epoch": 3.6531365313653135, "grad_norm": 3.5447676181793213, "learning_rate": 6.67461996966962e-06, "loss": 1.4694, "step": 25740 }, { "epoch": 3.6559750212886746, "grad_norm": 3.3586156368255615, "learning_rate": 6.660573948731119e-06, "loss": 1.4481, "step": 25760 }, { "epoch": 3.6588135112120352, "grad_norm": 3.3928849697113037, "learning_rate": 6.646535339200412e-06, "loss": 1.4421, "step": 25780 }, { "epoch": 3.661652001135396, "grad_norm": 3.326345443725586, "learning_rate": 6.632504172234314e-06, "loss": 1.4349, "step": 25800 }, { "epoch": 3.664490491058757, "grad_norm": 3.33632755279541, "learning_rate": 6.61848047897311e-06, "loss": 1.464, "step": 25820 }, { "epoch": 3.6673289809821172, "grad_norm": 3.322783946990967, "learning_rate": 6.6044642905405086e-06, "loss": 1.4534, "step": 25840 }, { "epoch": 3.6701674709054783, "grad_norm": 3.4152019023895264, "learning_rate": 6.590455638043562e-06, "loss": 1.4438, "step": 25860 }, { "epoch": 3.673005960828839, "grad_norm": 3.3580520153045654, "learning_rate": 6.576454552572587e-06, "loss": 1.4092, "step": 25880 }, { "epoch": 3.6758444507521997, "grad_norm": 3.541163444519043, "learning_rate": 6.562461065201115e-06, "loss": 1.4087, "step": 25900 }, { "epoch": 3.678682940675561, "grad_norm": 3.3519675731658936, "learning_rate": 6.5484752069858186e-06, "loss": 1.4142, "step": 25920 }, { "epoch": 3.6815214305989215, "grad_norm": 3.4979493618011475, "learning_rate": 6.5344970089664206e-06, "loss": 1.4397, "step": 25940 }, { "epoch": 3.684359920522282, "grad_norm": 3.6403958797454834, "learning_rate": 6.520526502165666e-06, "loss": 1.4584, "step": 25960 }, { "epoch": 3.687198410445643, "grad_norm": 3.2847230434417725, "learning_rate": 6.506563717589211e-06, "loss": 1.4198, "step": 25980 }, { "epoch": 3.6900369003690034, "grad_norm": 3.3911709785461426, "learning_rate": 6.492608686225584e-06, "loss": 1.4323, "step": 26000 }, { "epoch": 3.6928753902923646, "grad_norm": 3.3148529529571533, "learning_rate": 6.47866143904611e-06, "loss": 1.4425, "step": 26020 }, { "epoch": 3.6957138802157252, "grad_norm": 3.437150478363037, "learning_rate": 6.464722007004822e-06, "loss": 1.4545, "step": 26040 }, { "epoch": 3.698552370139086, "grad_norm": 3.4192233085632324, "learning_rate": 6.450790421038422e-06, "loss": 1.4391, "step": 26060 }, { "epoch": 3.701390860062447, "grad_norm": 3.450601816177368, "learning_rate": 6.4368667120661995e-06, "loss": 1.437, "step": 26080 }, { "epoch": 3.7042293499858077, "grad_norm": 3.4797775745391846, "learning_rate": 6.422950910989951e-06, "loss": 1.4383, "step": 26100 }, { "epoch": 3.7070678399091683, "grad_norm": 3.3390986919403076, "learning_rate": 6.409043048693928e-06, "loss": 1.4248, "step": 26120 }, { "epoch": 3.709906329832529, "grad_norm": 3.4437825679779053, "learning_rate": 6.395143156044771e-06, "loss": 1.4321, "step": 26140 }, { "epoch": 3.7127448197558897, "grad_norm": 3.383654832839966, "learning_rate": 6.381251263891417e-06, "loss": 1.441, "step": 26160 }, { "epoch": 3.7155833096792508, "grad_norm": 3.2234272956848145, "learning_rate": 6.367367403065064e-06, "loss": 1.4187, "step": 26180 }, { "epoch": 3.7184217996026114, "grad_norm": 3.550886631011963, "learning_rate": 6.3534916043790674e-06, "loss": 1.4324, "step": 26200 }, { "epoch": 3.721260289525972, "grad_norm": 3.500889778137207, "learning_rate": 6.3396238986289045e-06, "loss": 1.4007, "step": 26220 }, { "epoch": 3.724098779449333, "grad_norm": 3.3703832626342773, "learning_rate": 6.325764316592092e-06, "loss": 1.4622, "step": 26240 }, { "epoch": 3.726937269372694, "grad_norm": 3.366621255874634, "learning_rate": 6.311912889028102e-06, "loss": 1.4277, "step": 26260 }, { "epoch": 3.7297757592960545, "grad_norm": 3.569024085998535, "learning_rate": 6.298069646678324e-06, "loss": 1.4318, "step": 26280 }, { "epoch": 3.732614249219415, "grad_norm": 3.4911625385284424, "learning_rate": 6.284234620265977e-06, "loss": 1.4185, "step": 26300 }, { "epoch": 3.735452739142776, "grad_norm": 3.4030697345733643, "learning_rate": 6.270407840496043e-06, "loss": 1.4163, "step": 26320 }, { "epoch": 3.738291229066137, "grad_norm": 3.4648075103759766, "learning_rate": 6.256589338055202e-06, "loss": 1.4061, "step": 26340 }, { "epoch": 3.7411297189894976, "grad_norm": 3.3087997436523438, "learning_rate": 6.242779143611764e-06, "loss": 1.4065, "step": 26360 }, { "epoch": 3.7439682089128583, "grad_norm": 3.549787998199463, "learning_rate": 6.228977287815603e-06, "loss": 1.4102, "step": 26380 }, { "epoch": 3.746806698836219, "grad_norm": 3.5618906021118164, "learning_rate": 6.215183801298089e-06, "loss": 1.4503, "step": 26400 }, { "epoch": 3.7496451887595796, "grad_norm": 3.3897016048431396, "learning_rate": 6.201398714672007e-06, "loss": 1.3617, "step": 26420 }, { "epoch": 3.7524836786829407, "grad_norm": 3.650221109390259, "learning_rate": 6.187622058531507e-06, "loss": 1.4385, "step": 26440 }, { "epoch": 3.7553221686063014, "grad_norm": 3.438333511352539, "learning_rate": 6.173853863452035e-06, "loss": 1.4343, "step": 26460 }, { "epoch": 3.758160658529662, "grad_norm": 3.4128596782684326, "learning_rate": 6.160094159990243e-06, "loss": 1.4482, "step": 26480 }, { "epoch": 3.760999148453023, "grad_norm": 3.5222368240356445, "learning_rate": 6.146342978683953e-06, "loss": 1.4264, "step": 26500 }, { "epoch": 3.763837638376384, "grad_norm": 3.2998077869415283, "learning_rate": 6.132600350052058e-06, "loss": 1.3982, "step": 26520 }, { "epoch": 3.7666761282997445, "grad_norm": 3.2709522247314453, "learning_rate": 6.118866304594482e-06, "loss": 1.4631, "step": 26540 }, { "epoch": 3.769514618223105, "grad_norm": 3.4011142253875732, "learning_rate": 6.105140872792095e-06, "loss": 1.3915, "step": 26560 }, { "epoch": 3.772353108146466, "grad_norm": 3.3245699405670166, "learning_rate": 6.091424085106649e-06, "loss": 1.4237, "step": 26580 }, { "epoch": 3.775191598069827, "grad_norm": 3.4566919803619385, "learning_rate": 6.077715971980711e-06, "loss": 1.4734, "step": 26600 }, { "epoch": 3.7780300879931876, "grad_norm": 3.5840368270874023, "learning_rate": 6.064016563837604e-06, "loss": 1.426, "step": 26620 }, { "epoch": 3.7808685779165483, "grad_norm": 3.400447368621826, "learning_rate": 6.050325891081316e-06, "loss": 1.4345, "step": 26640 }, { "epoch": 3.7837070678399094, "grad_norm": 3.3948938846588135, "learning_rate": 6.036643984096464e-06, "loss": 1.4507, "step": 26660 }, { "epoch": 3.78654555776327, "grad_norm": 3.3893017768859863, "learning_rate": 6.022970873248206e-06, "loss": 1.4635, "step": 26680 }, { "epoch": 3.7893840476866307, "grad_norm": 3.5240049362182617, "learning_rate": 6.009306588882167e-06, "loss": 1.3982, "step": 26700 }, { "epoch": 3.7922225376099914, "grad_norm": 3.458639144897461, "learning_rate": 5.9956511613244e-06, "loss": 1.4204, "step": 26720 }, { "epoch": 3.795061027533352, "grad_norm": 3.539504289627075, "learning_rate": 5.982004620881288e-06, "loss": 1.4381, "step": 26740 }, { "epoch": 3.797899517456713, "grad_norm": 3.391981601715088, "learning_rate": 5.968366997839498e-06, "loss": 1.4314, "step": 26760 }, { "epoch": 3.800738007380074, "grad_norm": 3.5270636081695557, "learning_rate": 5.9547383224659035e-06, "loss": 1.4638, "step": 26780 }, { "epoch": 3.8035764973034345, "grad_norm": 3.391427993774414, "learning_rate": 5.941118625007516e-06, "loss": 1.4433, "step": 26800 }, { "epoch": 3.8064149872267956, "grad_norm": 3.2816219329833984, "learning_rate": 5.92750793569143e-06, "loss": 1.4122, "step": 26820 }, { "epoch": 3.8092534771501563, "grad_norm": 3.409456729888916, "learning_rate": 5.913906284724744e-06, "loss": 1.4562, "step": 26840 }, { "epoch": 3.812091967073517, "grad_norm": 3.552630662918091, "learning_rate": 5.90031370229449e-06, "loss": 1.4489, "step": 26860 }, { "epoch": 3.8149304569968776, "grad_norm": 3.6890652179718018, "learning_rate": 5.886730218567589e-06, "loss": 1.4325, "step": 26880 }, { "epoch": 3.8177689469202383, "grad_norm": 3.3528244495391846, "learning_rate": 5.873155863690749e-06, "loss": 1.4125, "step": 26900 }, { "epoch": 3.8206074368435994, "grad_norm": 3.257086992263794, "learning_rate": 5.859590667790436e-06, "loss": 1.4211, "step": 26920 }, { "epoch": 3.82344592676696, "grad_norm": 3.308529853820801, "learning_rate": 5.8460346609727805e-06, "loss": 1.4189, "step": 26940 }, { "epoch": 3.8262844166903207, "grad_norm": 3.3572146892547607, "learning_rate": 5.832487873323515e-06, "loss": 1.4519, "step": 26960 }, { "epoch": 3.8291229066136814, "grad_norm": 3.485232353210449, "learning_rate": 5.81895033490792e-06, "loss": 1.4313, "step": 26980 }, { "epoch": 3.831961396537042, "grad_norm": 3.4398555755615234, "learning_rate": 5.805422075770746e-06, "loss": 1.4321, "step": 27000 }, { "epoch": 3.834799886460403, "grad_norm": 3.266847610473633, "learning_rate": 5.791903125936142e-06, "loss": 1.4275, "step": 27020 }, { "epoch": 3.837638376383764, "grad_norm": 3.3655261993408203, "learning_rate": 5.77839351540761e-06, "loss": 1.422, "step": 27040 }, { "epoch": 3.8404768663071245, "grad_norm": 3.3943629264831543, "learning_rate": 5.7648932741679095e-06, "loss": 1.4535, "step": 27060 }, { "epoch": 3.8433153562304856, "grad_norm": 3.4835948944091797, "learning_rate": 5.751402432179019e-06, "loss": 1.4041, "step": 27080 }, { "epoch": 3.8461538461538463, "grad_norm": 3.226762533187866, "learning_rate": 5.7379210193820464e-06, "loss": 1.4594, "step": 27100 }, { "epoch": 3.848992336077207, "grad_norm": 3.2850000858306885, "learning_rate": 5.724449065697182e-06, "loss": 1.4382, "step": 27120 }, { "epoch": 3.8518308260005676, "grad_norm": 3.047348976135254, "learning_rate": 5.710986601023611e-06, "loss": 1.4387, "step": 27140 }, { "epoch": 3.8546693159239283, "grad_norm": 3.5676119327545166, "learning_rate": 5.69753365523948e-06, "loss": 1.4415, "step": 27160 }, { "epoch": 3.8575078058472894, "grad_norm": 3.1339187622070312, "learning_rate": 5.68409025820178e-06, "loss": 1.4428, "step": 27180 }, { "epoch": 3.86034629577065, "grad_norm": 3.625317096710205, "learning_rate": 5.670656439746338e-06, "loss": 1.429, "step": 27200 }, { "epoch": 3.8631847856940107, "grad_norm": 3.3884177207946777, "learning_rate": 5.657232229687706e-06, "loss": 1.4229, "step": 27220 }, { "epoch": 3.866023275617372, "grad_norm": 3.5102763175964355, "learning_rate": 5.6438176578191165e-06, "loss": 1.4522, "step": 27240 }, { "epoch": 3.8688617655407325, "grad_norm": 3.3887314796447754, "learning_rate": 5.63041275391241e-06, "loss": 1.423, "step": 27260 }, { "epoch": 3.871700255464093, "grad_norm": 3.5343093872070312, "learning_rate": 5.617017547717971e-06, "loss": 1.445, "step": 27280 }, { "epoch": 3.874538745387454, "grad_norm": 3.289602041244507, "learning_rate": 5.603632068964664e-06, "loss": 1.4522, "step": 27300 }, { "epoch": 3.8773772353108145, "grad_norm": 3.33013653755188, "learning_rate": 5.590256347359758e-06, "loss": 1.4198, "step": 27320 }, { "epoch": 3.8802157252341756, "grad_norm": 3.4199821949005127, "learning_rate": 5.5768904125888745e-06, "loss": 1.4175, "step": 27340 }, { "epoch": 3.8830542151575362, "grad_norm": 3.223597526550293, "learning_rate": 5.563534294315906e-06, "loss": 1.3887, "step": 27360 }, { "epoch": 3.885892705080897, "grad_norm": 3.2658143043518066, "learning_rate": 5.550188022182976e-06, "loss": 1.4174, "step": 27380 }, { "epoch": 3.888731195004258, "grad_norm": 3.41963529586792, "learning_rate": 5.536851625810329e-06, "loss": 1.4771, "step": 27400 }, { "epoch": 3.8915696849276182, "grad_norm": 3.401566505432129, "learning_rate": 5.523525134796321e-06, "loss": 1.4326, "step": 27420 }, { "epoch": 3.8944081748509793, "grad_norm": 3.3584539890289307, "learning_rate": 5.5102085787172935e-06, "loss": 1.4218, "step": 27440 }, { "epoch": 3.89724666477434, "grad_norm": 3.5216176509857178, "learning_rate": 5.496901987127568e-06, "loss": 1.48, "step": 27460 }, { "epoch": 3.9000851546977007, "grad_norm": 3.652277708053589, "learning_rate": 5.4836053895593326e-06, "loss": 1.4634, "step": 27480 }, { "epoch": 3.902923644621062, "grad_norm": 3.2207040786743164, "learning_rate": 5.470318815522601e-06, "loss": 1.4061, "step": 27500 }, { "epoch": 3.9057621345444224, "grad_norm": 3.524167776107788, "learning_rate": 5.4570422945051416e-06, "loss": 1.4441, "step": 27520 }, { "epoch": 3.908600624467783, "grad_norm": 3.4355928897857666, "learning_rate": 5.443775855972409e-06, "loss": 1.4013, "step": 27540 }, { "epoch": 3.911439114391144, "grad_norm": 3.490790367126465, "learning_rate": 5.430519529367484e-06, "loss": 1.4387, "step": 27560 }, { "epoch": 3.9142776043145044, "grad_norm": 3.789229393005371, "learning_rate": 5.4172733441110025e-06, "loss": 1.4385, "step": 27580 }, { "epoch": 3.9171160942378656, "grad_norm": 3.617640972137451, "learning_rate": 5.4040373296010966e-06, "loss": 1.4183, "step": 27600 }, { "epoch": 3.919954584161226, "grad_norm": 3.302490711212158, "learning_rate": 5.3908115152133164e-06, "loss": 1.4229, "step": 27620 }, { "epoch": 3.922793074084587, "grad_norm": 3.3983263969421387, "learning_rate": 5.3775959303005945e-06, "loss": 1.4526, "step": 27640 }, { "epoch": 3.925631564007948, "grad_norm": 3.2773683071136475, "learning_rate": 5.364390604193133e-06, "loss": 1.3964, "step": 27660 }, { "epoch": 3.9284700539313087, "grad_norm": 3.3874120712280273, "learning_rate": 5.3511955661983894e-06, "loss": 1.4344, "step": 27680 }, { "epoch": 3.9313085438546693, "grad_norm": 3.446448564529419, "learning_rate": 5.338010845600978e-06, "loss": 1.4203, "step": 27700 }, { "epoch": 3.93414703377803, "grad_norm": 3.4998583793640137, "learning_rate": 5.324836471662614e-06, "loss": 1.45, "step": 27720 }, { "epoch": 3.9369855237013907, "grad_norm": 3.3228752613067627, "learning_rate": 5.311672473622052e-06, "loss": 1.4357, "step": 27740 }, { "epoch": 3.9398240136247518, "grad_norm": 3.5515880584716797, "learning_rate": 5.29851888069502e-06, "loss": 1.4545, "step": 27760 }, { "epoch": 3.9426625035481124, "grad_norm": 3.2263689041137695, "learning_rate": 5.285375722074151e-06, "loss": 1.454, "step": 27780 }, { "epoch": 3.945500993471473, "grad_norm": 3.333953380584717, "learning_rate": 5.272243026928919e-06, "loss": 1.452, "step": 27800 }, { "epoch": 3.948339483394834, "grad_norm": 3.3280673027038574, "learning_rate": 5.25912082440558e-06, "loss": 1.4747, "step": 27820 }, { "epoch": 3.951177973318195, "grad_norm": 3.2877233028411865, "learning_rate": 5.246009143627097e-06, "loss": 1.4642, "step": 27840 }, { "epoch": 3.9540164632415555, "grad_norm": 3.3526668548583984, "learning_rate": 5.2329080136930945e-06, "loss": 1.4198, "step": 27860 }, { "epoch": 3.956854953164916, "grad_norm": 3.542003870010376, "learning_rate": 5.219817463679759e-06, "loss": 1.3749, "step": 27880 }, { "epoch": 3.959693443088277, "grad_norm": 3.619995355606079, "learning_rate": 5.206737522639817e-06, "loss": 1.4632, "step": 27900 }, { "epoch": 3.962531933011638, "grad_norm": 3.346191644668579, "learning_rate": 5.193668219602444e-06, "loss": 1.4086, "step": 27920 }, { "epoch": 3.9653704229349986, "grad_norm": 3.4477336406707764, "learning_rate": 5.180609583573195e-06, "loss": 1.4773, "step": 27940 }, { "epoch": 3.9682089128583593, "grad_norm": 3.1747334003448486, "learning_rate": 5.167561643533969e-06, "loss": 1.4329, "step": 27960 }, { "epoch": 3.97104740278172, "grad_norm": 3.616567611694336, "learning_rate": 5.154524428442906e-06, "loss": 1.4313, "step": 27980 }, { "epoch": 3.9738858927050806, "grad_norm": 3.487492084503174, "learning_rate": 5.141497967234364e-06, "loss": 1.4344, "step": 28000 }, { "epoch": 3.9767243826284417, "grad_norm": 3.5283517837524414, "learning_rate": 5.128482288818821e-06, "loss": 1.4275, "step": 28020 }, { "epoch": 3.9795628725518024, "grad_norm": 3.210801124572754, "learning_rate": 5.115477422082829e-06, "loss": 1.4728, "step": 28040 }, { "epoch": 3.982401362475163, "grad_norm": 3.3055663108825684, "learning_rate": 5.102483395888937e-06, "loss": 1.4264, "step": 28060 }, { "epoch": 3.985239852398524, "grad_norm": 3.567871570587158, "learning_rate": 5.089500239075658e-06, "loss": 1.4314, "step": 28080 }, { "epoch": 3.988078342321885, "grad_norm": 3.550602436065674, "learning_rate": 5.076527980457348e-06, "loss": 1.3998, "step": 28100 }, { "epoch": 3.9909168322452455, "grad_norm": 3.5756328105926514, "learning_rate": 5.063566648824207e-06, "loss": 1.4388, "step": 28120 }, { "epoch": 3.993755322168606, "grad_norm": 3.139738082885742, "learning_rate": 5.050616272942157e-06, "loss": 1.4237, "step": 28140 }, { "epoch": 3.996593812091967, "grad_norm": 3.489283561706543, "learning_rate": 5.037676881552831e-06, "loss": 1.4358, "step": 28160 }, { "epoch": 3.999432302015328, "grad_norm": 3.491947650909424, "learning_rate": 5.0247485033734685e-06, "loss": 1.4341, "step": 28180 }, { "epoch": 4.002270791938688, "grad_norm": 3.8061912059783936, "learning_rate": 5.01183116709686e-06, "loss": 1.2584, "step": 28200 }, { "epoch": 4.005109281862049, "grad_norm": 3.5535686016082764, "learning_rate": 4.99892490139131e-06, "loss": 1.1982, "step": 28220 }, { "epoch": 4.00794777178541, "grad_norm": 3.7377846240997314, "learning_rate": 4.986029734900538e-06, "loss": 1.1895, "step": 28240 }, { "epoch": 4.010786261708771, "grad_norm": 3.8872344493865967, "learning_rate": 4.973145696243635e-06, "loss": 1.218, "step": 28260 }, { "epoch": 4.013624751632132, "grad_norm": 3.9284212589263916, "learning_rate": 4.960272814014992e-06, "loss": 1.2243, "step": 28280 }, { "epoch": 4.016463241555493, "grad_norm": 3.9064342975616455, "learning_rate": 4.947411116784255e-06, "loss": 1.1838, "step": 28300 }, { "epoch": 4.019301731478853, "grad_norm": 3.7421724796295166, "learning_rate": 4.934560633096216e-06, "loss": 1.202, "step": 28320 }, { "epoch": 4.022140221402214, "grad_norm": 4.055378437042236, "learning_rate": 4.921721391470817e-06, "loss": 1.1844, "step": 28340 }, { "epoch": 4.024978711325574, "grad_norm": 3.790045976638794, "learning_rate": 4.908893420403018e-06, "loss": 1.1658, "step": 28360 }, { "epoch": 4.0278172012489355, "grad_norm": 4.107481002807617, "learning_rate": 4.896076748362778e-06, "loss": 1.2053, "step": 28380 }, { "epoch": 4.030655691172297, "grad_norm": 4.524267196655273, "learning_rate": 4.883271403794994e-06, "loss": 1.2183, "step": 28400 }, { "epoch": 4.033494181095657, "grad_norm": 3.988011360168457, "learning_rate": 4.870477415119391e-06, "loss": 1.1849, "step": 28420 }, { "epoch": 4.036332671019018, "grad_norm": 3.663440704345703, "learning_rate": 4.857694810730522e-06, "loss": 1.1928, "step": 28440 }, { "epoch": 4.039171160942379, "grad_norm": 4.130860805511475, "learning_rate": 4.8449236189976565e-06, "loss": 1.2509, "step": 28460 }, { "epoch": 4.042009650865739, "grad_norm": 4.277409076690674, "learning_rate": 4.832163868264742e-06, "loss": 1.25, "step": 28480 }, { "epoch": 4.0448481407891, "grad_norm": 3.820749044418335, "learning_rate": 4.819415586850331e-06, "loss": 1.1783, "step": 28500 }, { "epoch": 4.047686630712461, "grad_norm": 3.8916690349578857, "learning_rate": 4.806678803047523e-06, "loss": 1.1715, "step": 28520 }, { "epoch": 4.050525120635822, "grad_norm": 4.130290985107422, "learning_rate": 4.793953545123899e-06, "loss": 1.2318, "step": 28540 }, { "epoch": 4.053363610559183, "grad_norm": 4.118926525115967, "learning_rate": 4.78123984132146e-06, "loss": 1.1841, "step": 28560 }, { "epoch": 4.056202100482543, "grad_norm": 4.280374050140381, "learning_rate": 4.7685377198565675e-06, "loss": 1.1707, "step": 28580 }, { "epoch": 4.059040590405904, "grad_norm": 4.145346641540527, "learning_rate": 4.755847208919868e-06, "loss": 1.2109, "step": 28600 }, { "epoch": 4.061879080329265, "grad_norm": 4.196761131286621, "learning_rate": 4.74316833667626e-06, "loss": 1.178, "step": 28620 }, { "epoch": 4.0647175702526255, "grad_norm": 4.167775630950928, "learning_rate": 4.730501131264782e-06, "loss": 1.2161, "step": 28640 }, { "epoch": 4.067556060175987, "grad_norm": 3.8826279640197754, "learning_rate": 4.717845620798612e-06, "loss": 1.2207, "step": 28660 }, { "epoch": 4.070394550099347, "grad_norm": 4.0560808181762695, "learning_rate": 4.70520183336494e-06, "loss": 1.168, "step": 28680 }, { "epoch": 4.073233040022708, "grad_norm": 4.231751441955566, "learning_rate": 4.69256979702497e-06, "loss": 1.1941, "step": 28700 }, { "epoch": 4.076071529946069, "grad_norm": 4.008339881896973, "learning_rate": 4.6799495398138006e-06, "loss": 1.1991, "step": 28720 }, { "epoch": 4.078910019869429, "grad_norm": 4.00504732131958, "learning_rate": 4.667341089740402e-06, "loss": 1.2058, "step": 28740 }, { "epoch": 4.08174850979279, "grad_norm": 3.9676270484924316, "learning_rate": 4.654744474787538e-06, "loss": 1.2271, "step": 28760 }, { "epoch": 4.084586999716151, "grad_norm": 3.994428873062134, "learning_rate": 4.642159722911702e-06, "loss": 1.2068, "step": 28780 }, { "epoch": 4.087425489639512, "grad_norm": 4.257818698883057, "learning_rate": 4.629586862043062e-06, "loss": 1.2184, "step": 28800 }, { "epoch": 4.090263979562873, "grad_norm": 4.114505767822266, "learning_rate": 4.617025920085389e-06, "loss": 1.1941, "step": 28820 }, { "epoch": 4.093102469486233, "grad_norm": 4.380485534667969, "learning_rate": 4.604476924916022e-06, "loss": 1.181, "step": 28840 }, { "epoch": 4.095940959409594, "grad_norm": 4.10575008392334, "learning_rate": 4.591939904385755e-06, "loss": 1.1781, "step": 28860 }, { "epoch": 4.098779449332955, "grad_norm": 4.187893390655518, "learning_rate": 4.579414886318836e-06, "loss": 1.198, "step": 28880 }, { "epoch": 4.1016179392563155, "grad_norm": 4.231674671173096, "learning_rate": 4.566901898512846e-06, "loss": 1.2107, "step": 28900 }, { "epoch": 4.104456429179677, "grad_norm": 3.669097661972046, "learning_rate": 4.554400968738693e-06, "loss": 1.1837, "step": 28920 }, { "epoch": 4.107294919103037, "grad_norm": 4.04845666885376, "learning_rate": 4.541912124740509e-06, "loss": 1.2081, "step": 28940 }, { "epoch": 4.110133409026398, "grad_norm": 4.222532272338867, "learning_rate": 4.529435394235607e-06, "loss": 1.2277, "step": 28960 }, { "epoch": 4.112971898949759, "grad_norm": 4.208317279815674, "learning_rate": 4.516970804914416e-06, "loss": 1.1915, "step": 28980 }, { "epoch": 4.115810388873119, "grad_norm": 3.817049026489258, "learning_rate": 4.504518384440416e-06, "loss": 1.1859, "step": 29000 }, { "epoch": 4.11864887879648, "grad_norm": 4.376369953155518, "learning_rate": 4.492078160450084e-06, "loss": 1.2211, "step": 29020 }, { "epoch": 4.1214873687198414, "grad_norm": 4.0470662117004395, "learning_rate": 4.479650160552827e-06, "loss": 1.1866, "step": 29040 }, { "epoch": 4.124325858643202, "grad_norm": 4.205740451812744, "learning_rate": 4.467234412330924e-06, "loss": 1.2215, "step": 29060 }, { "epoch": 4.127164348566563, "grad_norm": 4.047322750091553, "learning_rate": 4.454830943339455e-06, "loss": 1.2235, "step": 29080 }, { "epoch": 4.130002838489923, "grad_norm": 4.262022972106934, "learning_rate": 4.442439781106265e-06, "loss": 1.1911, "step": 29100 }, { "epoch": 4.132841328413284, "grad_norm": 4.065189361572266, "learning_rate": 4.430060953131863e-06, "loss": 1.2142, "step": 29120 }, { "epoch": 4.135679818336645, "grad_norm": 4.165289878845215, "learning_rate": 4.417694486889405e-06, "loss": 1.2309, "step": 29140 }, { "epoch": 4.138518308260005, "grad_norm": 4.113856792449951, "learning_rate": 4.405340409824599e-06, "loss": 1.2055, "step": 29160 }, { "epoch": 4.1413567981833665, "grad_norm": 4.153496742248535, "learning_rate": 4.39299874935566e-06, "loss": 1.2549, "step": 29180 }, { "epoch": 4.144195288106728, "grad_norm": 4.374662399291992, "learning_rate": 4.380669532873249e-06, "loss": 1.1536, "step": 29200 }, { "epoch": 4.147033778030088, "grad_norm": 3.5948405265808105, "learning_rate": 4.3683527877404045e-06, "loss": 1.1745, "step": 29220 }, { "epoch": 4.149872267953449, "grad_norm": 3.863344669342041, "learning_rate": 4.356048541292489e-06, "loss": 1.1979, "step": 29240 }, { "epoch": 4.152710757876809, "grad_norm": 4.134788990020752, "learning_rate": 4.343756820837126e-06, "loss": 1.1873, "step": 29260 }, { "epoch": 4.15554924780017, "grad_norm": 3.9534366130828857, "learning_rate": 4.331477653654139e-06, "loss": 1.1967, "step": 29280 }, { "epoch": 4.158387737723531, "grad_norm": 3.931607246398926, "learning_rate": 4.3192110669954875e-06, "loss": 1.1921, "step": 29300 }, { "epoch": 4.161226227646892, "grad_norm": 4.155910968780518, "learning_rate": 4.306957088085225e-06, "loss": 1.2456, "step": 29320 }, { "epoch": 4.164064717570253, "grad_norm": 4.276477813720703, "learning_rate": 4.294715744119398e-06, "loss": 1.209, "step": 29340 }, { "epoch": 4.166903207493613, "grad_norm": 4.4617414474487305, "learning_rate": 4.282487062266035e-06, "loss": 1.2053, "step": 29360 }, { "epoch": 4.169741697416974, "grad_norm": 4.151037693023682, "learning_rate": 4.270271069665052e-06, "loss": 1.2089, "step": 29380 }, { "epoch": 4.172580187340335, "grad_norm": 4.0761613845825195, "learning_rate": 4.2580677934282034e-06, "loss": 1.2231, "step": 29400 }, { "epoch": 4.175418677263695, "grad_norm": 4.310911655426025, "learning_rate": 4.245877260639024e-06, "loss": 1.1917, "step": 29420 }, { "epoch": 4.1782571671870565, "grad_norm": 4.045643329620361, "learning_rate": 4.233699498352763e-06, "loss": 1.1888, "step": 29440 }, { "epoch": 4.181095657110418, "grad_norm": 4.377154350280762, "learning_rate": 4.221534533596331e-06, "loss": 1.2097, "step": 29460 }, { "epoch": 4.183934147033778, "grad_norm": 4.514101028442383, "learning_rate": 4.209382393368232e-06, "loss": 1.1867, "step": 29480 }, { "epoch": 4.186772636957139, "grad_norm": 3.9826836585998535, "learning_rate": 4.197243104638511e-06, "loss": 1.2269, "step": 29500 }, { "epoch": 4.189611126880499, "grad_norm": 4.262808322906494, "learning_rate": 4.185116694348685e-06, "loss": 1.2419, "step": 29520 }, { "epoch": 4.19244961680386, "grad_norm": 4.065732955932617, "learning_rate": 4.173003189411705e-06, "loss": 1.1853, "step": 29540 }, { "epoch": 4.195288106727221, "grad_norm": 4.307413578033447, "learning_rate": 4.160902616711856e-06, "loss": 1.2009, "step": 29560 }, { "epoch": 4.198126596650582, "grad_norm": 4.237000465393066, "learning_rate": 4.148815003104747e-06, "loss": 1.1844, "step": 29580 }, { "epoch": 4.200965086573943, "grad_norm": 4.150912761688232, "learning_rate": 4.136740375417199e-06, "loss": 1.2019, "step": 29600 }, { "epoch": 4.203803576497304, "grad_norm": 4.250378131866455, "learning_rate": 4.124678760447239e-06, "loss": 1.221, "step": 29620 }, { "epoch": 4.206642066420664, "grad_norm": 4.3246917724609375, "learning_rate": 4.112630184963999e-06, "loss": 1.1974, "step": 29640 }, { "epoch": 4.209480556344025, "grad_norm": 4.101644992828369, "learning_rate": 4.1005946757076745e-06, "loss": 1.1943, "step": 29660 }, { "epoch": 4.212319046267385, "grad_norm": 4.048989772796631, "learning_rate": 4.08857225938946e-06, "loss": 1.2184, "step": 29680 }, { "epoch": 4.2151575361907465, "grad_norm": 4.2792253494262695, "learning_rate": 4.076562962691499e-06, "loss": 1.2215, "step": 29700 }, { "epoch": 4.217996026114108, "grad_norm": 4.042673110961914, "learning_rate": 4.064566812266811e-06, "loss": 1.2161, "step": 29720 }, { "epoch": 4.220834516037468, "grad_norm": 4.266378879547119, "learning_rate": 4.052583834739239e-06, "loss": 1.2054, "step": 29740 }, { "epoch": 4.223673005960829, "grad_norm": 4.184782981872559, "learning_rate": 4.040614056703396e-06, "loss": 1.2521, "step": 29760 }, { "epoch": 4.226511495884189, "grad_norm": 4.369879722595215, "learning_rate": 4.028657504724593e-06, "loss": 1.1921, "step": 29780 }, { "epoch": 4.22934998580755, "grad_norm": 4.193240165710449, "learning_rate": 4.016714205338802e-06, "loss": 1.1925, "step": 29800 }, { "epoch": 4.232188475730911, "grad_norm": 4.394773483276367, "learning_rate": 4.004784185052559e-06, "loss": 1.2452, "step": 29820 }, { "epoch": 4.235026965654272, "grad_norm": 3.976792097091675, "learning_rate": 3.992867470342953e-06, "loss": 1.2153, "step": 29840 }, { "epoch": 4.237865455577633, "grad_norm": 4.469869613647461, "learning_rate": 3.9809640876575326e-06, "loss": 1.1821, "step": 29860 }, { "epoch": 4.240703945500994, "grad_norm": 4.369547367095947, "learning_rate": 3.969074063414247e-06, "loss": 1.1913, "step": 29880 }, { "epoch": 4.243542435424354, "grad_norm": 4.1876678466796875, "learning_rate": 3.957197424001418e-06, "loss": 1.1997, "step": 29900 }, { "epoch": 4.246380925347715, "grad_norm": 4.031350612640381, "learning_rate": 3.9453341957776506e-06, "loss": 1.2144, "step": 29920 }, { "epoch": 4.249219415271075, "grad_norm": 3.8776323795318604, "learning_rate": 3.933484405071787e-06, "loss": 1.1751, "step": 29940 }, { "epoch": 4.2520579051944365, "grad_norm": 4.159640312194824, "learning_rate": 3.921648078182845e-06, "loss": 1.215, "step": 29960 }, { "epoch": 4.254896395117798, "grad_norm": 4.052855014801025, "learning_rate": 3.909825241379965e-06, "loss": 1.2028, "step": 29980 }, { "epoch": 4.257734885041158, "grad_norm": 4.6220550537109375, "learning_rate": 3.8980159209023424e-06, "loss": 1.218, "step": 30000 }, { "epoch": 4.260573374964519, "grad_norm": 3.8969154357910156, "learning_rate": 3.886220142959188e-06, "loss": 1.1749, "step": 30020 }, { "epoch": 4.26341186488788, "grad_norm": 4.483845233917236, "learning_rate": 3.8744379337296375e-06, "loss": 1.1967, "step": 30040 }, { "epoch": 4.26625035481124, "grad_norm": 4.319317817687988, "learning_rate": 3.862669319362723e-06, "loss": 1.2002, "step": 30060 }, { "epoch": 4.269088844734601, "grad_norm": 4.391806125640869, "learning_rate": 3.850914325977314e-06, "loss": 1.2063, "step": 30080 }, { "epoch": 4.271927334657962, "grad_norm": 4.126306056976318, "learning_rate": 3.839172979662026e-06, "loss": 1.2228, "step": 30100 }, { "epoch": 4.274765824581323, "grad_norm": 4.443070411682129, "learning_rate": 3.827445306475215e-06, "loss": 1.2047, "step": 30120 }, { "epoch": 4.277604314504684, "grad_norm": 4.124887466430664, "learning_rate": 3.815731332444863e-06, "loss": 1.2, "step": 30140 }, { "epoch": 4.280442804428044, "grad_norm": 4.318425178527832, "learning_rate": 3.8040310835685744e-06, "loss": 1.2203, "step": 30160 }, { "epoch": 4.283281294351405, "grad_norm": 4.039337158203125, "learning_rate": 3.792344585813477e-06, "loss": 1.1799, "step": 30180 }, { "epoch": 4.286119784274765, "grad_norm": 4.0576701164245605, "learning_rate": 3.780671865116181e-06, "loss": 1.192, "step": 30200 }, { "epoch": 4.2889582741981265, "grad_norm": 4.1350884437561035, "learning_rate": 3.7690129473827262e-06, "loss": 1.2027, "step": 30220 }, { "epoch": 4.291796764121488, "grad_norm": 3.997584342956543, "learning_rate": 3.7573678584885143e-06, "loss": 1.1783, "step": 30240 }, { "epoch": 4.294635254044848, "grad_norm": 3.9795620441436768, "learning_rate": 3.7457366242782557e-06, "loss": 1.2022, "step": 30260 }, { "epoch": 4.297473743968209, "grad_norm": 3.974874973297119, "learning_rate": 3.7341192705659148e-06, "loss": 1.2098, "step": 30280 }, { "epoch": 4.30031223389157, "grad_norm": 4.166277885437012, "learning_rate": 3.722515823134647e-06, "loss": 1.1811, "step": 30300 }, { "epoch": 4.30315072381493, "grad_norm": 4.213529109954834, "learning_rate": 3.710926307736744e-06, "loss": 1.2088, "step": 30320 }, { "epoch": 4.305989213738291, "grad_norm": 4.167385578155518, "learning_rate": 3.6993507500935886e-06, "loss": 1.1837, "step": 30340 }, { "epoch": 4.3088277036616525, "grad_norm": 4.274970531463623, "learning_rate": 3.687789175895564e-06, "loss": 1.2427, "step": 30360 }, { "epoch": 4.311666193585013, "grad_norm": 4.354950904846191, "learning_rate": 3.6762416108020415e-06, "loss": 1.2185, "step": 30380 }, { "epoch": 4.314504683508374, "grad_norm": 4.169710636138916, "learning_rate": 3.6647080804412893e-06, "loss": 1.1888, "step": 30400 }, { "epoch": 4.317343173431734, "grad_norm": 4.0662522315979, "learning_rate": 3.6531886104104277e-06, "loss": 1.2043, "step": 30420 }, { "epoch": 4.320181663355095, "grad_norm": 4.089513778686523, "learning_rate": 3.6416832262753752e-06, "loss": 1.1796, "step": 30440 }, { "epoch": 4.323020153278456, "grad_norm": 4.583818435668945, "learning_rate": 3.6301919535707887e-06, "loss": 1.2218, "step": 30460 }, { "epoch": 4.3258586432018165, "grad_norm": 4.06287145614624, "learning_rate": 3.618714817800002e-06, "loss": 1.2172, "step": 30480 }, { "epoch": 4.328697133125178, "grad_norm": 4.500308990478516, "learning_rate": 3.607251844434979e-06, "loss": 1.1915, "step": 30500 }, { "epoch": 4.331535623048538, "grad_norm": 4.174163341522217, "learning_rate": 3.5958030589162506e-06, "loss": 1.2126, "step": 30520 }, { "epoch": 4.334374112971899, "grad_norm": 4.333515167236328, "learning_rate": 3.584368486652855e-06, "loss": 1.2191, "step": 30540 }, { "epoch": 4.33721260289526, "grad_norm": 3.900927782058716, "learning_rate": 3.5729481530222996e-06, "loss": 1.1918, "step": 30560 }, { "epoch": 4.34005109281862, "grad_norm": 4.163120746612549, "learning_rate": 3.5615420833704706e-06, "loss": 1.1976, "step": 30580 }, { "epoch": 4.342889582741981, "grad_norm": 4.039196491241455, "learning_rate": 3.550150303011616e-06, "loss": 1.1958, "step": 30600 }, { "epoch": 4.3457280726653424, "grad_norm": 4.160585880279541, "learning_rate": 3.5387728372282613e-06, "loss": 1.2101, "step": 30620 }, { "epoch": 4.348566562588703, "grad_norm": 4.21484899520874, "learning_rate": 3.5274097112711637e-06, "loss": 1.2233, "step": 30640 }, { "epoch": 4.351405052512064, "grad_norm": 4.071081638336182, "learning_rate": 3.5160609503592547e-06, "loss": 1.2394, "step": 30660 }, { "epoch": 4.354243542435424, "grad_norm": 4.245738983154297, "learning_rate": 3.504726579679587e-06, "loss": 1.2094, "step": 30680 }, { "epoch": 4.357082032358785, "grad_norm": 4.131201267242432, "learning_rate": 3.4934066243872742e-06, "loss": 1.1739, "step": 30700 }, { "epoch": 4.359920522282146, "grad_norm": 3.9100990295410156, "learning_rate": 3.482101109605438e-06, "loss": 1.1991, "step": 30720 }, { "epoch": 4.362759012205506, "grad_norm": 4.169238090515137, "learning_rate": 3.4708100604251495e-06, "loss": 1.1818, "step": 30740 }, { "epoch": 4.3655975021288675, "grad_norm": 4.1853461265563965, "learning_rate": 3.459533501905373e-06, "loss": 1.2051, "step": 30760 }, { "epoch": 4.368435992052229, "grad_norm": 3.8853559494018555, "learning_rate": 3.448271459072927e-06, "loss": 1.1792, "step": 30780 }, { "epoch": 4.371274481975589, "grad_norm": 3.5924649238586426, "learning_rate": 3.4370239569223917e-06, "loss": 1.2006, "step": 30800 }, { "epoch": 4.37411297189895, "grad_norm": 4.409698963165283, "learning_rate": 3.425791020416095e-06, "loss": 1.22, "step": 30820 }, { "epoch": 4.37695146182231, "grad_norm": 4.433238506317139, "learning_rate": 3.414572674484031e-06, "loss": 1.2538, "step": 30840 }, { "epoch": 4.379789951745671, "grad_norm": 3.991166353225708, "learning_rate": 3.4033689440238104e-06, "loss": 1.2425, "step": 30860 }, { "epoch": 4.382628441669032, "grad_norm": 4.196600437164307, "learning_rate": 3.3921798539006103e-06, "loss": 1.2206, "step": 30880 }, { "epoch": 4.385466931592393, "grad_norm": 4.038817405700684, "learning_rate": 3.3810054289471117e-06, "loss": 1.2061, "step": 30900 }, { "epoch": 4.388305421515754, "grad_norm": 3.901364326477051, "learning_rate": 3.3698456939634538e-06, "loss": 1.1678, "step": 30920 }, { "epoch": 4.391143911439114, "grad_norm": 4.37705659866333, "learning_rate": 3.358700673717168e-06, "loss": 1.2217, "step": 30940 }, { "epoch": 4.393982401362475, "grad_norm": 4.2602219581604, "learning_rate": 3.3475703929431315e-06, "loss": 1.2489, "step": 30960 }, { "epoch": 4.396820891285836, "grad_norm": 4.24072790145874, "learning_rate": 3.3364548763435044e-06, "loss": 1.1991, "step": 30980 }, { "epoch": 4.399659381209196, "grad_norm": 4.103697299957275, "learning_rate": 3.325354148587694e-06, "loss": 1.1953, "step": 31000 }, { "epoch": 4.4024978711325575, "grad_norm": 4.283117294311523, "learning_rate": 3.314268234312261e-06, "loss": 1.1841, "step": 31020 }, { "epoch": 4.405336361055919, "grad_norm": 4.507607936859131, "learning_rate": 3.303197158120918e-06, "loss": 1.2109, "step": 31040 }, { "epoch": 4.408174850979279, "grad_norm": 3.9769234657287598, "learning_rate": 3.292140944584419e-06, "loss": 1.2234, "step": 31060 }, { "epoch": 4.41101334090264, "grad_norm": 4.049487113952637, "learning_rate": 3.281099618240555e-06, "loss": 1.2269, "step": 31080 }, { "epoch": 4.413851830826, "grad_norm": 4.103230953216553, "learning_rate": 3.270073203594064e-06, "loss": 1.1976, "step": 31100 }, { "epoch": 4.416690320749361, "grad_norm": 3.9699721336364746, "learning_rate": 3.2590617251165947e-06, "loss": 1.2399, "step": 31120 }, { "epoch": 4.419528810672722, "grad_norm": 4.285155296325684, "learning_rate": 3.2480652072466433e-06, "loss": 1.1987, "step": 31140 }, { "epoch": 4.422367300596083, "grad_norm": 4.1625237464904785, "learning_rate": 3.2370836743895053e-06, "loss": 1.1917, "step": 31160 }, { "epoch": 4.425205790519444, "grad_norm": 4.038602352142334, "learning_rate": 3.2261171509172194e-06, "loss": 1.2089, "step": 31180 }, { "epoch": 4.428044280442805, "grad_norm": 3.9457907676696777, "learning_rate": 3.215165661168512e-06, "loss": 1.2144, "step": 31200 }, { "epoch": 4.430882770366165, "grad_norm": 4.203016757965088, "learning_rate": 3.204229229448743e-06, "loss": 1.197, "step": 31220 }, { "epoch": 4.433721260289526, "grad_norm": 4.498661041259766, "learning_rate": 3.1933078800298534e-06, "loss": 1.2163, "step": 31240 }, { "epoch": 4.436559750212886, "grad_norm": 4.068940162658691, "learning_rate": 3.182401637150321e-06, "loss": 1.2139, "step": 31260 }, { "epoch": 4.4393982401362475, "grad_norm": 4.065382957458496, "learning_rate": 3.1715105250150725e-06, "loss": 1.2298, "step": 31280 }, { "epoch": 4.442236730059609, "grad_norm": 4.162303924560547, "learning_rate": 3.160634567795482e-06, "loss": 1.2244, "step": 31300 }, { "epoch": 4.445075219982969, "grad_norm": 4.115585803985596, "learning_rate": 3.1497737896292714e-06, "loss": 1.1914, "step": 31320 }, { "epoch": 4.44791370990633, "grad_norm": 4.224687099456787, "learning_rate": 3.13892821462048e-06, "loss": 1.2037, "step": 31340 }, { "epoch": 4.45075219982969, "grad_norm": 4.342637538909912, "learning_rate": 3.1280978668394057e-06, "loss": 1.2115, "step": 31360 }, { "epoch": 4.453590689753051, "grad_norm": 4.161805152893066, "learning_rate": 3.1172827703225516e-06, "loss": 1.2173, "step": 31380 }, { "epoch": 4.456429179676412, "grad_norm": 4.022461414337158, "learning_rate": 3.106482949072572e-06, "loss": 1.2045, "step": 31400 }, { "epoch": 4.459267669599773, "grad_norm": 4.145800590515137, "learning_rate": 3.0956984270582214e-06, "loss": 1.2611, "step": 31420 }, { "epoch": 4.462106159523134, "grad_norm": 4.350988388061523, "learning_rate": 3.0849292282142973e-06, "loss": 1.2069, "step": 31440 }, { "epoch": 4.464944649446495, "grad_norm": 4.197218418121338, "learning_rate": 3.0741753764415873e-06, "loss": 1.2319, "step": 31460 }, { "epoch": 4.467783139369855, "grad_norm": 4.1769208908081055, "learning_rate": 3.063436895606833e-06, "loss": 1.2069, "step": 31480 }, { "epoch": 4.470621629293216, "grad_norm": 4.333700656890869, "learning_rate": 3.0527138095426356e-06, "loss": 1.1914, "step": 31500 }, { "epoch": 4.473460119216576, "grad_norm": 4.445151329040527, "learning_rate": 3.042006142047458e-06, "loss": 1.2136, "step": 31520 }, { "epoch": 4.4762986091399375, "grad_norm": 4.29864501953125, "learning_rate": 3.0313139168855244e-06, "loss": 1.1802, "step": 31540 }, { "epoch": 4.479137099063299, "grad_norm": 4.161866664886475, "learning_rate": 3.0206371577867944e-06, "loss": 1.1856, "step": 31560 }, { "epoch": 4.481975588986659, "grad_norm": 4.3437275886535645, "learning_rate": 3.0099758884469045e-06, "loss": 1.243, "step": 31580 }, { "epoch": 4.48481407891002, "grad_norm": 4.18651819229126, "learning_rate": 2.9993301325271017e-06, "loss": 1.2343, "step": 31600 }, { "epoch": 4.487652568833381, "grad_norm": 4.188718795776367, "learning_rate": 2.9886999136542206e-06, "loss": 1.1895, "step": 31620 }, { "epoch": 4.490491058756741, "grad_norm": 4.216392517089844, "learning_rate": 2.9780852554206017e-06, "loss": 1.2131, "step": 31640 }, { "epoch": 4.493329548680102, "grad_norm": 4.252201557159424, "learning_rate": 2.967486181384055e-06, "loss": 1.2303, "step": 31660 }, { "epoch": 4.496168038603463, "grad_norm": 4.020390510559082, "learning_rate": 2.956902715067799e-06, "loss": 1.1795, "step": 31680 }, { "epoch": 4.499006528526824, "grad_norm": 4.54580545425415, "learning_rate": 2.9463348799604253e-06, "loss": 1.2078, "step": 31700 }, { "epoch": 4.501845018450185, "grad_norm": 4.470859527587891, "learning_rate": 2.935782699515812e-06, "loss": 1.2041, "step": 31720 }, { "epoch": 4.504683508373545, "grad_norm": 4.217555999755859, "learning_rate": 2.925246197153121e-06, "loss": 1.1847, "step": 31740 }, { "epoch": 4.507521998296906, "grad_norm": 4.122226238250732, "learning_rate": 2.9147253962566934e-06, "loss": 1.2303, "step": 31760 }, { "epoch": 4.510360488220266, "grad_norm": 4.357614517211914, "learning_rate": 2.9042203201760364e-06, "loss": 1.1947, "step": 31780 }, { "epoch": 4.5131989781436275, "grad_norm": 4.2126922607421875, "learning_rate": 2.893730992225765e-06, "loss": 1.2015, "step": 31800 }, { "epoch": 4.516037468066989, "grad_norm": 4.190207004547119, "learning_rate": 2.88325743568552e-06, "loss": 1.2326, "step": 31820 }, { "epoch": 4.518875957990349, "grad_norm": 4.2659406661987305, "learning_rate": 2.8727996737999665e-06, "loss": 1.2099, "step": 31840 }, { "epoch": 4.52171444791371, "grad_norm": 4.364379405975342, "learning_rate": 2.8623577297786962e-06, "loss": 1.1979, "step": 31860 }, { "epoch": 4.524552937837071, "grad_norm": 4.40162467956543, "learning_rate": 2.8519316267962062e-06, "loss": 1.1872, "step": 31880 }, { "epoch": 4.527391427760431, "grad_norm": 4.120308876037598, "learning_rate": 2.8415213879918256e-06, "loss": 1.2217, "step": 31900 }, { "epoch": 4.530229917683792, "grad_norm": 4.366111755371094, "learning_rate": 2.831127036469693e-06, "loss": 1.2202, "step": 31920 }, { "epoch": 4.5330684076071535, "grad_norm": 4.321118354797363, "learning_rate": 2.820748595298667e-06, "loss": 1.2222, "step": 31940 }, { "epoch": 4.535906897530514, "grad_norm": 4.022904396057129, "learning_rate": 2.8103860875123057e-06, "loss": 1.2119, "step": 31960 }, { "epoch": 4.538745387453875, "grad_norm": 4.2673211097717285, "learning_rate": 2.800039536108806e-06, "loss": 1.1917, "step": 31980 }, { "epoch": 4.541583877377235, "grad_norm": 4.0569586753845215, "learning_rate": 2.7897089640509466e-06, "loss": 1.2298, "step": 32000 }, { "epoch": 4.544422367300596, "grad_norm": 4.326756954193115, "learning_rate": 2.779394394266053e-06, "loss": 1.2072, "step": 32020 }, { "epoch": 4.547260857223957, "grad_norm": 4.038242816925049, "learning_rate": 2.7690958496459164e-06, "loss": 1.221, "step": 32040 }, { "epoch": 4.5500993471473175, "grad_norm": 4.20168924331665, "learning_rate": 2.7588133530467844e-06, "loss": 1.2081, "step": 32060 }, { "epoch": 4.552937837070679, "grad_norm": 4.103909969329834, "learning_rate": 2.748546927289273e-06, "loss": 1.2258, "step": 32080 }, { "epoch": 4.555776326994039, "grad_norm": 4.321893692016602, "learning_rate": 2.7382965951583364e-06, "loss": 1.1837, "step": 32100 }, { "epoch": 4.5586148169174, "grad_norm": 4.376620769500732, "learning_rate": 2.7280623794032103e-06, "loss": 1.2177, "step": 32120 }, { "epoch": 4.561453306840761, "grad_norm": 4.131616592407227, "learning_rate": 2.7178443027373634e-06, "loss": 1.2105, "step": 32140 }, { "epoch": 4.564291796764121, "grad_norm": 4.161381244659424, "learning_rate": 2.707642387838446e-06, "loss": 1.1833, "step": 32160 }, { "epoch": 4.567130286687482, "grad_norm": 4.320641040802002, "learning_rate": 2.6974566573482362e-06, "loss": 1.1887, "step": 32180 }, { "epoch": 4.5699687766108426, "grad_norm": 4.212276935577393, "learning_rate": 2.687287133872597e-06, "loss": 1.1967, "step": 32200 }, { "epoch": 4.572807266534204, "grad_norm": 4.248649597167969, "learning_rate": 2.6771338399814175e-06, "loss": 1.2092, "step": 32220 }, { "epoch": 4.575645756457565, "grad_norm": 4.448605537414551, "learning_rate": 2.6669967982085777e-06, "loss": 1.2109, "step": 32240 }, { "epoch": 4.578484246380925, "grad_norm": 3.9797677993774414, "learning_rate": 2.6568760310518705e-06, "loss": 1.167, "step": 32260 }, { "epoch": 4.581322736304286, "grad_norm": 4.243673324584961, "learning_rate": 2.646771560972992e-06, "loss": 1.1983, "step": 32280 }, { "epoch": 4.584161226227647, "grad_norm": 4.120595932006836, "learning_rate": 2.636683410397444e-06, "loss": 1.1985, "step": 32300 }, { "epoch": 4.586999716151007, "grad_norm": 4.35316801071167, "learning_rate": 2.6266116017145316e-06, "loss": 1.2052, "step": 32320 }, { "epoch": 4.5898382060743685, "grad_norm": 4.310662269592285, "learning_rate": 2.6165561572772802e-06, "loss": 1.2124, "step": 32340 }, { "epoch": 4.59267669599773, "grad_norm": 4.186830520629883, "learning_rate": 2.606517099402397e-06, "loss": 1.2313, "step": 32360 }, { "epoch": 4.59551518592109, "grad_norm": 4.295746326446533, "learning_rate": 2.5964944503702237e-06, "loss": 1.1966, "step": 32380 }, { "epoch": 4.598353675844451, "grad_norm": 4.3582353591918945, "learning_rate": 2.586488232424685e-06, "loss": 1.2242, "step": 32400 }, { "epoch": 4.601192165767811, "grad_norm": 4.26339864730835, "learning_rate": 2.5764984677732385e-06, "loss": 1.2212, "step": 32420 }, { "epoch": 4.604030655691172, "grad_norm": 4.1733012199401855, "learning_rate": 2.566525178586822e-06, "loss": 1.2249, "step": 32440 }, { "epoch": 4.606869145614533, "grad_norm": 4.199159145355225, "learning_rate": 2.5565683869998204e-06, "loss": 1.2517, "step": 32460 }, { "epoch": 4.609707635537894, "grad_norm": 4.591892242431641, "learning_rate": 2.5466281151099846e-06, "loss": 1.2172, "step": 32480 }, { "epoch": 4.612546125461255, "grad_norm": 4.2332048416137695, "learning_rate": 2.5367043849784256e-06, "loss": 1.2092, "step": 32500 }, { "epoch": 4.615384615384615, "grad_norm": 4.124276638031006, "learning_rate": 2.5267972186295174e-06, "loss": 1.1985, "step": 32520 }, { "epoch": 4.618223105307976, "grad_norm": 4.14388370513916, "learning_rate": 2.5169066380508967e-06, "loss": 1.2036, "step": 32540 }, { "epoch": 4.621061595231337, "grad_norm": 4.129189968109131, "learning_rate": 2.507032665193374e-06, "loss": 1.2387, "step": 32560 }, { "epoch": 4.623900085154697, "grad_norm": 3.9499118328094482, "learning_rate": 2.497175321970907e-06, "loss": 1.2017, "step": 32580 }, { "epoch": 4.6267385750780585, "grad_norm": 4.234541893005371, "learning_rate": 2.487334630260546e-06, "loss": 1.164, "step": 32600 }, { "epoch": 4.62957706500142, "grad_norm": 4.327895641326904, "learning_rate": 2.477510611902385e-06, "loss": 1.2145, "step": 32620 }, { "epoch": 4.63241555492478, "grad_norm": 4.4970927238464355, "learning_rate": 2.4677032886995143e-06, "loss": 1.2411, "step": 32640 }, { "epoch": 4.635254044848141, "grad_norm": 4.419497013092041, "learning_rate": 2.45791268241797e-06, "loss": 1.219, "step": 32660 }, { "epoch": 4.638092534771501, "grad_norm": 4.27142858505249, "learning_rate": 2.448138814786689e-06, "loss": 1.2065, "step": 32680 }, { "epoch": 4.640931024694862, "grad_norm": 4.417922496795654, "learning_rate": 2.4383817074974557e-06, "loss": 1.2521, "step": 32700 }, { "epoch": 4.643769514618223, "grad_norm": 4.192352771759033, "learning_rate": 2.4286413822048692e-06, "loss": 1.214, "step": 32720 }, { "epoch": 4.646608004541584, "grad_norm": 4.056733131408691, "learning_rate": 2.4189178605262597e-06, "loss": 1.2094, "step": 32740 }, { "epoch": 4.649446494464945, "grad_norm": 4.075312614440918, "learning_rate": 2.4092111640416893e-06, "loss": 1.2116, "step": 32760 }, { "epoch": 4.652284984388306, "grad_norm": 4.197427272796631, "learning_rate": 2.3995213142938646e-06, "loss": 1.2175, "step": 32780 }, { "epoch": 4.655123474311666, "grad_norm": 4.053891658782959, "learning_rate": 2.389848332788105e-06, "loss": 1.2009, "step": 32800 }, { "epoch": 4.657961964235027, "grad_norm": 4.301068305969238, "learning_rate": 2.380192240992295e-06, "loss": 1.2282, "step": 32820 }, { "epoch": 4.660800454158387, "grad_norm": 3.963064670562744, "learning_rate": 2.3705530603368343e-06, "loss": 1.1622, "step": 32840 }, { "epoch": 4.6636389440817485, "grad_norm": 4.0677385330200195, "learning_rate": 2.360930812214589e-06, "loss": 1.1986, "step": 32860 }, { "epoch": 4.66647743400511, "grad_norm": 3.8505804538726807, "learning_rate": 2.3513255179808493e-06, "loss": 1.224, "step": 32880 }, { "epoch": 4.66931592392847, "grad_norm": 4.1468729972839355, "learning_rate": 2.3417371989532733e-06, "loss": 1.1953, "step": 32900 }, { "epoch": 4.672154413851831, "grad_norm": 4.339526176452637, "learning_rate": 2.332165876411847e-06, "loss": 1.1969, "step": 32920 }, { "epoch": 4.674992903775191, "grad_norm": 4.04060697555542, "learning_rate": 2.3226115715988453e-06, "loss": 1.2494, "step": 32940 }, { "epoch": 4.677831393698552, "grad_norm": 4.335938930511475, "learning_rate": 2.313074305718751e-06, "loss": 1.1851, "step": 32960 }, { "epoch": 4.680669883621913, "grad_norm": 4.342891216278076, "learning_rate": 2.3035540999382553e-06, "loss": 1.2214, "step": 32980 }, { "epoch": 4.683508373545274, "grad_norm": 4.167670726776123, "learning_rate": 2.294050975386174e-06, "loss": 1.2079, "step": 33000 }, { "epoch": 4.686346863468635, "grad_norm": 4.16892671585083, "learning_rate": 2.2845649531534154e-06, "loss": 1.1893, "step": 33020 }, { "epoch": 4.689185353391996, "grad_norm": 4.217544078826904, "learning_rate": 2.2750960542929335e-06, "loss": 1.214, "step": 33040 }, { "epoch": 4.692023843315356, "grad_norm": 4.109299182891846, "learning_rate": 2.2656442998196783e-06, "loss": 1.2119, "step": 33060 }, { "epoch": 4.694862333238717, "grad_norm": 4.418539047241211, "learning_rate": 2.2562097107105484e-06, "loss": 1.2264, "step": 33080 }, { "epoch": 4.697700823162078, "grad_norm": 4.301245212554932, "learning_rate": 2.2467923079043487e-06, "loss": 1.1928, "step": 33100 }, { "epoch": 4.7005393130854385, "grad_norm": 4.0132036209106445, "learning_rate": 2.2373921123017404e-06, "loss": 1.1701, "step": 33120 }, { "epoch": 4.7033778030088, "grad_norm": 4.730660915374756, "learning_rate": 2.228009144765192e-06, "loss": 1.2269, "step": 33140 }, { "epoch": 4.70621629293216, "grad_norm": 4.238565444946289, "learning_rate": 2.2186434261189505e-06, "loss": 1.2095, "step": 33160 }, { "epoch": 4.709054782855521, "grad_norm": 4.31150484085083, "learning_rate": 2.209294977148958e-06, "loss": 1.1918, "step": 33180 }, { "epoch": 4.711893272778882, "grad_norm": 4.1933274269104, "learning_rate": 2.199963818602855e-06, "loss": 1.1984, "step": 33200 }, { "epoch": 4.714731762702242, "grad_norm": 4.142677307128906, "learning_rate": 2.190649971189881e-06, "loss": 1.1803, "step": 33220 }, { "epoch": 4.717570252625603, "grad_norm": 4.011044979095459, "learning_rate": 2.1813534555808815e-06, "loss": 1.2194, "step": 33240 }, { "epoch": 4.720408742548964, "grad_norm": 4.171502113342285, "learning_rate": 2.1720742924082206e-06, "loss": 1.2012, "step": 33260 }, { "epoch": 4.723247232472325, "grad_norm": 4.2323503494262695, "learning_rate": 2.1628125022657554e-06, "loss": 1.2113, "step": 33280 }, { "epoch": 4.726085722395686, "grad_norm": 4.332464218139648, "learning_rate": 2.1535681057087856e-06, "loss": 1.2022, "step": 33300 }, { "epoch": 4.728924212319046, "grad_norm": 4.360069274902344, "learning_rate": 2.144341123254008e-06, "loss": 1.2121, "step": 33320 }, { "epoch": 4.731762702242407, "grad_norm": 4.153099536895752, "learning_rate": 2.135131575379472e-06, "loss": 1.2048, "step": 33340 }, { "epoch": 4.734601192165767, "grad_norm": 4.181245803833008, "learning_rate": 2.125939482524533e-06, "loss": 1.1758, "step": 33360 }, { "epoch": 4.7374396820891285, "grad_norm": 4.210803508758545, "learning_rate": 2.1167648650898055e-06, "loss": 1.1931, "step": 33380 }, { "epoch": 4.74027817201249, "grad_norm": 4.084834575653076, "learning_rate": 2.10760774343712e-06, "loss": 1.1858, "step": 33400 }, { "epoch": 4.74311666193585, "grad_norm": 4.261116981506348, "learning_rate": 2.0984681378894865e-06, "loss": 1.2243, "step": 33420 }, { "epoch": 4.745955151859211, "grad_norm": 4.039736270904541, "learning_rate": 2.0893460687310207e-06, "loss": 1.1819, "step": 33440 }, { "epoch": 4.748793641782572, "grad_norm": 4.18594217300415, "learning_rate": 2.08024155620694e-06, "loss": 1.2468, "step": 33460 }, { "epoch": 4.751632131705932, "grad_norm": 4.2600417137146, "learning_rate": 2.0711546205234866e-06, "loss": 1.2039, "step": 33480 }, { "epoch": 4.754470621629293, "grad_norm": 4.389910697937012, "learning_rate": 2.0620852818478876e-06, "loss": 1.2314, "step": 33500 }, { "epoch": 4.7573091115526545, "grad_norm": 4.058572292327881, "learning_rate": 2.0530335603083283e-06, "loss": 1.2362, "step": 33520 }, { "epoch": 4.760147601476015, "grad_norm": 4.076740264892578, "learning_rate": 2.0439994759938887e-06, "loss": 1.1938, "step": 33540 }, { "epoch": 4.762986091399376, "grad_norm": 4.24329948425293, "learning_rate": 2.0349830489545052e-06, "loss": 1.2231, "step": 33560 }, { "epoch": 4.765824581322736, "grad_norm": 3.9640092849731445, "learning_rate": 2.025984299200928e-06, "loss": 1.1937, "step": 33580 }, { "epoch": 4.768663071246097, "grad_norm": 4.1298136711120605, "learning_rate": 2.017003246704673e-06, "loss": 1.2313, "step": 33600 }, { "epoch": 4.771501561169458, "grad_norm": 4.268634796142578, "learning_rate": 2.0080399113979786e-06, "loss": 1.2112, "step": 33620 }, { "epoch": 4.7743400510928184, "grad_norm": 4.511301040649414, "learning_rate": 1.999094313173772e-06, "loss": 1.1909, "step": 33640 }, { "epoch": 4.77717854101618, "grad_norm": 4.131304740905762, "learning_rate": 1.9901664718855997e-06, "loss": 1.2074, "step": 33660 }, { "epoch": 4.78001703093954, "grad_norm": 4.284353256225586, "learning_rate": 1.9812564073476057e-06, "loss": 1.1925, "step": 33680 }, { "epoch": 4.782855520862901, "grad_norm": 4.380399703979492, "learning_rate": 1.9723641393344916e-06, "loss": 1.2165, "step": 33700 }, { "epoch": 4.785694010786262, "grad_norm": 4.205719470977783, "learning_rate": 1.9634896875814414e-06, "loss": 1.2187, "step": 33720 }, { "epoch": 4.788532500709622, "grad_norm": 4.410092830657959, "learning_rate": 1.954633071784119e-06, "loss": 1.2136, "step": 33740 }, { "epoch": 4.791370990632983, "grad_norm": 4.0550127029418945, "learning_rate": 1.9457943115985845e-06, "loss": 1.1964, "step": 33760 }, { "epoch": 4.7942094805563436, "grad_norm": 4.2895917892456055, "learning_rate": 1.9369734266412876e-06, "loss": 1.2247, "step": 33780 }, { "epoch": 4.797047970479705, "grad_norm": 3.9567370414733887, "learning_rate": 1.9281704364889954e-06, "loss": 1.2177, "step": 33800 }, { "epoch": 4.799886460403066, "grad_norm": 4.259525299072266, "learning_rate": 1.9193853606787614e-06, "loss": 1.2034, "step": 33820 }, { "epoch": 4.802724950326426, "grad_norm": 4.266725540161133, "learning_rate": 1.910618218707884e-06, "loss": 1.2342, "step": 33840 }, { "epoch": 4.805563440249787, "grad_norm": 3.970677375793457, "learning_rate": 1.9018690300338572e-06, "loss": 1.1938, "step": 33860 }, { "epoch": 4.808401930173148, "grad_norm": 4.229525089263916, "learning_rate": 1.8931378140743296e-06, "loss": 1.209, "step": 33880 }, { "epoch": 4.811240420096508, "grad_norm": 4.24655294418335, "learning_rate": 1.8844245902070635e-06, "loss": 1.2177, "step": 33900 }, { "epoch": 4.8140789100198695, "grad_norm": 4.393374443054199, "learning_rate": 1.8757293777698903e-06, "loss": 1.2282, "step": 33920 }, { "epoch": 4.816917399943231, "grad_norm": 4.151028156280518, "learning_rate": 1.8670521960606636e-06, "loss": 1.2407, "step": 33940 }, { "epoch": 4.819755889866591, "grad_norm": 4.1427130699157715, "learning_rate": 1.8583930643372306e-06, "loss": 1.2162, "step": 33960 }, { "epoch": 4.822594379789952, "grad_norm": 4.079700946807861, "learning_rate": 1.8497520018173609e-06, "loss": 1.1806, "step": 33980 }, { "epoch": 4.825432869713312, "grad_norm": 3.982774496078491, "learning_rate": 1.8411290276787419e-06, "loss": 1.2178, "step": 34000 }, { "epoch": 4.828271359636673, "grad_norm": 4.028487205505371, "learning_rate": 1.8325241610589028e-06, "loss": 1.2259, "step": 34020 }, { "epoch": 4.831109849560034, "grad_norm": 4.390022277832031, "learning_rate": 1.8239374210551907e-06, "loss": 1.1899, "step": 34040 }, { "epoch": 4.833948339483395, "grad_norm": 4.358603000640869, "learning_rate": 1.8153688267247237e-06, "loss": 1.2335, "step": 34060 }, { "epoch": 4.836786829406756, "grad_norm": 4.0550336837768555, "learning_rate": 1.8068183970843434e-06, "loss": 1.2393, "step": 34080 }, { "epoch": 4.839625319330116, "grad_norm": 4.3478617668151855, "learning_rate": 1.7982861511105832e-06, "loss": 1.2475, "step": 34100 }, { "epoch": 4.842463809253477, "grad_norm": 4.11893892288208, "learning_rate": 1.7897721077396168e-06, "loss": 1.2243, "step": 34120 }, { "epoch": 4.845302299176838, "grad_norm": 4.31845760345459, "learning_rate": 1.7812762858672195e-06, "loss": 1.2348, "step": 34140 }, { "epoch": 4.848140789100198, "grad_norm": 4.366200923919678, "learning_rate": 1.772798704348726e-06, "loss": 1.209, "step": 34160 }, { "epoch": 4.8509792790235595, "grad_norm": 4.416780948638916, "learning_rate": 1.7643393819989962e-06, "loss": 1.2011, "step": 34180 }, { "epoch": 4.85381776894692, "grad_norm": 4.350912570953369, "learning_rate": 1.7558983375923532e-06, "loss": 1.228, "step": 34200 }, { "epoch": 4.856656258870281, "grad_norm": 4.1305975914001465, "learning_rate": 1.7474755898625672e-06, "loss": 1.2416, "step": 34220 }, { "epoch": 4.859494748793642, "grad_norm": 4.226979732513428, "learning_rate": 1.7390711575027942e-06, "loss": 1.1802, "step": 34240 }, { "epoch": 4.862333238717002, "grad_norm": 4.308587551116943, "learning_rate": 1.7306850591655421e-06, "loss": 1.1922, "step": 34260 }, { "epoch": 4.865171728640363, "grad_norm": 4.236902236938477, "learning_rate": 1.7223173134626335e-06, "loss": 1.2036, "step": 34280 }, { "epoch": 4.868010218563724, "grad_norm": 4.175553321838379, "learning_rate": 1.7139679389651531e-06, "loss": 1.2113, "step": 34300 }, { "epoch": 4.870848708487085, "grad_norm": 4.115438461303711, "learning_rate": 1.705636954203419e-06, "loss": 1.2174, "step": 34320 }, { "epoch": 4.873687198410446, "grad_norm": 4.232748031616211, "learning_rate": 1.6973243776669335e-06, "loss": 1.2288, "step": 34340 }, { "epoch": 4.876525688333807, "grad_norm": 4.354818820953369, "learning_rate": 1.6890302278043424e-06, "loss": 1.2366, "step": 34360 }, { "epoch": 4.879364178257167, "grad_norm": 4.017809867858887, "learning_rate": 1.6807545230233956e-06, "loss": 1.2063, "step": 34380 }, { "epoch": 4.882202668180528, "grad_norm": 4.242778301239014, "learning_rate": 1.672497281690918e-06, "loss": 1.2133, "step": 34400 }, { "epoch": 4.885041158103888, "grad_norm": 4.112461090087891, "learning_rate": 1.6642585221327379e-06, "loss": 1.2272, "step": 34420 }, { "epoch": 4.8878796480272495, "grad_norm": 3.968937397003174, "learning_rate": 1.6560382626336846e-06, "loss": 1.2091, "step": 34440 }, { "epoch": 4.890718137950611, "grad_norm": 4.409791469573975, "learning_rate": 1.6478365214375113e-06, "loss": 1.2322, "step": 34460 }, { "epoch": 4.893556627873971, "grad_norm": 4.174006462097168, "learning_rate": 1.6396533167468897e-06, "loss": 1.1609, "step": 34480 }, { "epoch": 4.896395117797332, "grad_norm": 4.501866817474365, "learning_rate": 1.6314886667233398e-06, "loss": 1.1834, "step": 34500 }, { "epoch": 4.899233607720692, "grad_norm": 4.279295921325684, "learning_rate": 1.6233425894872078e-06, "loss": 1.2234, "step": 34520 }, { "epoch": 4.902072097644053, "grad_norm": 3.8311116695404053, "learning_rate": 1.6152151031176178e-06, "loss": 1.2186, "step": 34540 }, { "epoch": 4.904910587567414, "grad_norm": 4.125194072723389, "learning_rate": 1.607106225652435e-06, "loss": 1.1962, "step": 34560 }, { "epoch": 4.907749077490775, "grad_norm": 4.068869590759277, "learning_rate": 1.5990159750882239e-06, "loss": 1.2275, "step": 34580 }, { "epoch": 4.910587567414136, "grad_norm": 4.224260330200195, "learning_rate": 1.5909443693802073e-06, "loss": 1.1878, "step": 34600 }, { "epoch": 4.913426057337497, "grad_norm": 4.254079818725586, "learning_rate": 1.582891426442238e-06, "loss": 1.2201, "step": 34620 }, { "epoch": 4.916264547260857, "grad_norm": 3.841064691543579, "learning_rate": 1.5748571641467313e-06, "loss": 1.1982, "step": 34640 }, { "epoch": 4.919103037184218, "grad_norm": 4.143589496612549, "learning_rate": 1.5668416003246644e-06, "loss": 1.2473, "step": 34660 }, { "epoch": 4.921941527107579, "grad_norm": 4.187138557434082, "learning_rate": 1.5588447527654937e-06, "loss": 1.2251, "step": 34680 }, { "epoch": 4.9247800170309395, "grad_norm": 4.191924571990967, "learning_rate": 1.550866639217158e-06, "loss": 1.2177, "step": 34700 }, { "epoch": 4.927618506954301, "grad_norm": 4.297159671783447, "learning_rate": 1.5429072773860066e-06, "loss": 1.2341, "step": 34720 }, { "epoch": 4.930456996877661, "grad_norm": 4.164078712463379, "learning_rate": 1.5349666849367729e-06, "loss": 1.2037, "step": 34740 }, { "epoch": 4.933295486801022, "grad_norm": 4.328807353973389, "learning_rate": 1.5270448794925374e-06, "loss": 1.2419, "step": 34760 }, { "epoch": 4.936133976724383, "grad_norm": 4.227936744689941, "learning_rate": 1.5191418786346833e-06, "loss": 1.1968, "step": 34780 }, { "epoch": 4.938972466647743, "grad_norm": 4.528786659240723, "learning_rate": 1.5112576999028594e-06, "loss": 1.2218, "step": 34800 }, { "epoch": 4.941810956571104, "grad_norm": 4.360342025756836, "learning_rate": 1.503392360794943e-06, "loss": 1.2238, "step": 34820 }, { "epoch": 4.944649446494465, "grad_norm": 4.534153461456299, "learning_rate": 1.4955458787669973e-06, "loss": 1.2273, "step": 34840 }, { "epoch": 4.947487936417826, "grad_norm": 4.3392205238342285, "learning_rate": 1.4877182712332338e-06, "loss": 1.256, "step": 34860 }, { "epoch": 4.950326426341187, "grad_norm": 4.143593788146973, "learning_rate": 1.4799095555659837e-06, "loss": 1.1888, "step": 34880 }, { "epoch": 4.953164916264547, "grad_norm": 4.1924543380737305, "learning_rate": 1.472119749095634e-06, "loss": 1.2058, "step": 34900 }, { "epoch": 4.956003406187908, "grad_norm": 4.173058986663818, "learning_rate": 1.464348869110621e-06, "loss": 1.2098, "step": 34920 }, { "epoch": 4.958841896111268, "grad_norm": 3.922661304473877, "learning_rate": 1.456596932857368e-06, "loss": 1.1945, "step": 34940 }, { "epoch": 4.9616803860346295, "grad_norm": 4.184589862823486, "learning_rate": 1.4488639575402552e-06, "loss": 1.2142, "step": 34960 }, { "epoch": 4.964518875957991, "grad_norm": 4.35450553894043, "learning_rate": 1.4411499603215873e-06, "loss": 1.1731, "step": 34980 }, { "epoch": 4.967357365881351, "grad_norm": 4.036869049072266, "learning_rate": 1.4334549583215362e-06, "loss": 1.1855, "step": 35000 }, { "epoch": 4.970195855804712, "grad_norm": 4.174251079559326, "learning_rate": 1.4257789686181355e-06, "loss": 1.2064, "step": 35020 }, { "epoch": 4.973034345728073, "grad_norm": 4.162795066833496, "learning_rate": 1.4181220082472091e-06, "loss": 1.2026, "step": 35040 }, { "epoch": 4.975872835651433, "grad_norm": 4.385854721069336, "learning_rate": 1.4104840942023545e-06, "loss": 1.2102, "step": 35060 }, { "epoch": 4.978711325574794, "grad_norm": 4.321869850158691, "learning_rate": 1.4028652434348934e-06, "loss": 1.2147, "step": 35080 }, { "epoch": 4.9815498154981555, "grad_norm": 4.551620960235596, "learning_rate": 1.3952654728538483e-06, "loss": 1.2233, "step": 35100 }, { "epoch": 4.984388305421516, "grad_norm": 4.565370559692383, "learning_rate": 1.3876847993258825e-06, "loss": 1.2106, "step": 35120 }, { "epoch": 4.987226795344877, "grad_norm": 4.226864814758301, "learning_rate": 1.3801232396752884e-06, "loss": 1.1767, "step": 35140 }, { "epoch": 4.990065285268237, "grad_norm": 4.240734577178955, "learning_rate": 1.3725808106839321e-06, "loss": 1.1995, "step": 35160 }, { "epoch": 4.992903775191598, "grad_norm": 4.105297565460205, "learning_rate": 1.365057529091217e-06, "loss": 1.185, "step": 35180 }, { "epoch": 4.995742265114959, "grad_norm": 4.137646675109863, "learning_rate": 1.357553411594066e-06, "loss": 1.2319, "step": 35200 }, { "epoch": 4.9985807550383194, "grad_norm": 4.07985258102417, "learning_rate": 1.3500684748468485e-06, "loss": 1.2052, "step": 35220 }, { "epoch": 5.0014192449616806, "grad_norm": 3.8502490520477295, "learning_rate": 1.3426027354613858e-06, "loss": 1.1398, "step": 35240 }, { "epoch": 5.004257734885041, "grad_norm": 4.384532451629639, "learning_rate": 1.3351562100068816e-06, "loss": 1.0872, "step": 35260 }, { "epoch": 5.007096224808402, "grad_norm": 4.536736011505127, "learning_rate": 1.3277289150098992e-06, "loss": 1.0535, "step": 35280 }, { "epoch": 5.009934714731763, "grad_norm": 4.548092365264893, "learning_rate": 1.320320866954321e-06, "loss": 1.0483, "step": 35300 }, { "epoch": 5.012773204655123, "grad_norm": 4.657932758331299, "learning_rate": 1.3129320822813207e-06, "loss": 1.0736, "step": 35320 }, { "epoch": 5.015611694578484, "grad_norm": 4.21279239654541, "learning_rate": 1.3055625773893066e-06, "loss": 1.0674, "step": 35340 }, { "epoch": 5.018450184501845, "grad_norm": 4.359063625335693, "learning_rate": 1.2982123686339121e-06, "loss": 1.0537, "step": 35360 }, { "epoch": 5.021288674425206, "grad_norm": 4.417820453643799, "learning_rate": 1.290881472327934e-06, "loss": 1.0536, "step": 35380 }, { "epoch": 5.024127164348567, "grad_norm": 4.734997749328613, "learning_rate": 1.283569904741312e-06, "loss": 1.0402, "step": 35400 }, { "epoch": 5.026965654271927, "grad_norm": 4.414511203765869, "learning_rate": 1.2762776821010958e-06, "loss": 1.0568, "step": 35420 }, { "epoch": 5.029804144195288, "grad_norm": 4.5938568115234375, "learning_rate": 1.2690048205913842e-06, "loss": 1.0602, "step": 35440 }, { "epoch": 5.032642634118649, "grad_norm": 4.728695869445801, "learning_rate": 1.2617513363533252e-06, "loss": 1.0841, "step": 35460 }, { "epoch": 5.035481124042009, "grad_norm": 4.49623441696167, "learning_rate": 1.2545172454850496e-06, "loss": 1.0608, "step": 35480 }, { "epoch": 5.0383196139653705, "grad_norm": 4.518426418304443, "learning_rate": 1.2473025640416525e-06, "loss": 1.0638, "step": 35500 }, { "epoch": 5.041158103888731, "grad_norm": 4.238348960876465, "learning_rate": 1.24010730803515e-06, "loss": 1.0469, "step": 35520 }, { "epoch": 5.043996593812092, "grad_norm": 4.233331680297852, "learning_rate": 1.232931493434446e-06, "loss": 1.0584, "step": 35540 }, { "epoch": 5.046835083735453, "grad_norm": 4.4011311531066895, "learning_rate": 1.2257751361652991e-06, "loss": 1.0743, "step": 35560 }, { "epoch": 5.049673573658813, "grad_norm": 4.8644280433654785, "learning_rate": 1.2186382521102823e-06, "loss": 1.0884, "step": 35580 }, { "epoch": 5.052512063582174, "grad_norm": 4.569098949432373, "learning_rate": 1.2115208571087532e-06, "loss": 1.0619, "step": 35600 }, { "epoch": 5.055350553505535, "grad_norm": 4.33804988861084, "learning_rate": 1.204422966956811e-06, "loss": 1.047, "step": 35620 }, { "epoch": 5.058189043428896, "grad_norm": 5.038528919219971, "learning_rate": 1.1973445974072762e-06, "loss": 1.0773, "step": 35640 }, { "epoch": 5.061027533352257, "grad_norm": 4.851517677307129, "learning_rate": 1.190285764169632e-06, "loss": 1.0802, "step": 35660 }, { "epoch": 5.063866023275617, "grad_norm": 4.5254807472229, "learning_rate": 1.1832464829100177e-06, "loss": 1.0555, "step": 35680 }, { "epoch": 5.066704513198978, "grad_norm": 4.638580322265625, "learning_rate": 1.1762267692511697e-06, "loss": 1.062, "step": 35700 }, { "epoch": 5.069543003122339, "grad_norm": 4.373376846313477, "learning_rate": 1.1692266387724005e-06, "loss": 1.0512, "step": 35720 }, { "epoch": 5.072381493045699, "grad_norm": 4.7300519943237305, "learning_rate": 1.1622461070095604e-06, "loss": 1.0785, "step": 35740 }, { "epoch": 5.0752199829690605, "grad_norm": 4.6349287033081055, "learning_rate": 1.1552851894550032e-06, "loss": 1.0728, "step": 35760 }, { "epoch": 5.078058472892422, "grad_norm": 4.2931742668151855, "learning_rate": 1.1483439015575493e-06, "loss": 1.0424, "step": 35780 }, { "epoch": 5.080896962815782, "grad_norm": 4.776019096374512, "learning_rate": 1.1414222587224554e-06, "loss": 1.079, "step": 35800 }, { "epoch": 5.083735452739143, "grad_norm": 4.643527030944824, "learning_rate": 1.1345202763113805e-06, "loss": 1.0617, "step": 35820 }, { "epoch": 5.086573942662503, "grad_norm": 4.568332195281982, "learning_rate": 1.1276379696423434e-06, "loss": 1.0846, "step": 35840 }, { "epoch": 5.089412432585864, "grad_norm": 4.563784599304199, "learning_rate": 1.120775353989707e-06, "loss": 1.0541, "step": 35860 }, { "epoch": 5.092250922509225, "grad_norm": 4.7970170974731445, "learning_rate": 1.1139324445841182e-06, "loss": 1.037, "step": 35880 }, { "epoch": 5.095089412432586, "grad_norm": 4.761439323425293, "learning_rate": 1.107109256612503e-06, "loss": 1.0647, "step": 35900 }, { "epoch": 5.097927902355947, "grad_norm": 4.437709331512451, "learning_rate": 1.1003058052180026e-06, "loss": 1.07, "step": 35920 }, { "epoch": 5.100766392279308, "grad_norm": 4.4450297355651855, "learning_rate": 1.0935221054999701e-06, "loss": 1.0712, "step": 35940 }, { "epoch": 5.103604882202668, "grad_norm": 4.347862720489502, "learning_rate": 1.0867581725139154e-06, "loss": 1.0837, "step": 35960 }, { "epoch": 5.106443372126029, "grad_norm": 4.277023792266846, "learning_rate": 1.0800140212714783e-06, "loss": 1.0609, "step": 35980 }, { "epoch": 5.109281862049389, "grad_norm": 4.72116231918335, "learning_rate": 1.073289666740398e-06, "loss": 1.0648, "step": 36000 }, { "epoch": 5.1121203519727505, "grad_norm": 4.548429489135742, "learning_rate": 1.0665851238444758e-06, "loss": 1.0571, "step": 36020 }, { "epoch": 5.114958841896112, "grad_norm": 4.453185558319092, "learning_rate": 1.0599004074635455e-06, "loss": 1.0899, "step": 36040 }, { "epoch": 5.117797331819472, "grad_norm": 4.473193645477295, "learning_rate": 1.0532355324334364e-06, "loss": 1.0688, "step": 36060 }, { "epoch": 5.120635821742833, "grad_norm": 4.668159008026123, "learning_rate": 1.0465905135459465e-06, "loss": 1.0577, "step": 36080 }, { "epoch": 5.123474311666193, "grad_norm": 4.441940784454346, "learning_rate": 1.0399653655487984e-06, "loss": 1.0688, "step": 36100 }, { "epoch": 5.126312801589554, "grad_norm": 4.564089298248291, "learning_rate": 1.0333601031456252e-06, "loss": 1.08, "step": 36120 }, { "epoch": 5.129151291512915, "grad_norm": 4.672436714172363, "learning_rate": 1.0267747409959129e-06, "loss": 1.0583, "step": 36140 }, { "epoch": 5.131989781436276, "grad_norm": 4.576383590698242, "learning_rate": 1.0202092937149931e-06, "loss": 1.0856, "step": 36160 }, { "epoch": 5.134828271359637, "grad_norm": 4.700583457946777, "learning_rate": 1.0136637758739954e-06, "loss": 1.0746, "step": 36180 }, { "epoch": 5.137666761282998, "grad_norm": 4.70177698135376, "learning_rate": 1.0071382019998144e-06, "loss": 1.0801, "step": 36200 }, { "epoch": 5.140505251206358, "grad_norm": 4.711193084716797, "learning_rate": 1.0006325865750888e-06, "loss": 1.0629, "step": 36220 }, { "epoch": 5.143343741129719, "grad_norm": 4.595587253570557, "learning_rate": 9.941469440381556e-07, "loss": 1.0627, "step": 36240 }, { "epoch": 5.146182231053079, "grad_norm": 4.354851245880127, "learning_rate": 9.8768128878303e-07, "loss": 1.0614, "step": 36260 }, { "epoch": 5.1490207209764405, "grad_norm": 4.297418594360352, "learning_rate": 9.812356351593644e-07, "loss": 1.072, "step": 36280 }, { "epoch": 5.151859210899802, "grad_norm": 4.600295543670654, "learning_rate": 9.748099974724224e-07, "loss": 1.074, "step": 36300 }, { "epoch": 5.154697700823162, "grad_norm": 4.640811920166016, "learning_rate": 9.684043899830409e-07, "loss": 1.093, "step": 36320 }, { "epoch": 5.157536190746523, "grad_norm": 4.19462776184082, "learning_rate": 9.620188269076135e-07, "loss": 1.0606, "step": 36340 }, { "epoch": 5.160374680669884, "grad_norm": 4.433244228363037, "learning_rate": 9.55653322418031e-07, "loss": 1.056, "step": 36360 }, { "epoch": 5.163213170593244, "grad_norm": 4.183903694152832, "learning_rate": 9.49307890641683e-07, "loss": 1.0321, "step": 36380 }, { "epoch": 5.166051660516605, "grad_norm": 4.579616069793701, "learning_rate": 9.429825456614006e-07, "loss": 1.0429, "step": 36400 }, { "epoch": 5.168890150439966, "grad_norm": 4.937745094299316, "learning_rate": 9.366773015154385e-07, "loss": 1.0886, "step": 36420 }, { "epoch": 5.171728640363327, "grad_norm": 4.3714213371276855, "learning_rate": 9.303921721974385e-07, "loss": 1.0599, "step": 36440 }, { "epoch": 5.174567130286688, "grad_norm": 4.670811653137207, "learning_rate": 9.241271716564026e-07, "loss": 1.0809, "step": 36460 }, { "epoch": 5.177405620210048, "grad_norm": 4.609099864959717, "learning_rate": 9.178823137966575e-07, "loss": 1.0852, "step": 36480 }, { "epoch": 5.180244110133409, "grad_norm": 4.651702404022217, "learning_rate": 9.116576124778276e-07, "loss": 1.0728, "step": 36500 }, { "epoch": 5.183082600056769, "grad_norm": 4.682258605957031, "learning_rate": 9.05453081514801e-07, "loss": 1.0779, "step": 36520 }, { "epoch": 5.1859210899801305, "grad_norm": 4.4918084144592285, "learning_rate": 8.992687346776996e-07, "loss": 1.0493, "step": 36540 }, { "epoch": 5.188759579903492, "grad_norm": 4.745410919189453, "learning_rate": 8.931045856918563e-07, "loss": 1.0695, "step": 36560 }, { "epoch": 5.191598069826852, "grad_norm": 4.720880508422852, "learning_rate": 8.869606482377658e-07, "loss": 1.1004, "step": 36580 }, { "epoch": 5.194436559750213, "grad_norm": 4.386775016784668, "learning_rate": 8.808369359510793e-07, "loss": 1.0624, "step": 36600 }, { "epoch": 5.197275049673574, "grad_norm": 4.614798545837402, "learning_rate": 8.747334624225467e-07, "loss": 1.0561, "step": 36620 }, { "epoch": 5.200113539596934, "grad_norm": 4.992986679077148, "learning_rate": 8.686502411980124e-07, "loss": 1.058, "step": 36640 }, { "epoch": 5.202952029520295, "grad_norm": 4.5681047439575195, "learning_rate": 8.625872857783702e-07, "loss": 1.053, "step": 36660 }, { "epoch": 5.205790519443656, "grad_norm": 4.427273750305176, "learning_rate": 8.565446096195329e-07, "loss": 1.0796, "step": 36680 }, { "epoch": 5.208629009367017, "grad_norm": 4.618629455566406, "learning_rate": 8.505222261324098e-07, "loss": 1.0451, "step": 36700 }, { "epoch": 5.211467499290378, "grad_norm": 4.481493949890137, "learning_rate": 8.445201486828736e-07, "loss": 1.0541, "step": 36720 }, { "epoch": 5.214305989213738, "grad_norm": 4.724452972412109, "learning_rate": 8.385383905917277e-07, "loss": 1.0769, "step": 36740 }, { "epoch": 5.217144479137099, "grad_norm": 4.6223907470703125, "learning_rate": 8.325769651346816e-07, "loss": 1.0686, "step": 36760 }, { "epoch": 5.21998296906046, "grad_norm": 4.452687740325928, "learning_rate": 8.266358855423217e-07, "loss": 1.0594, "step": 36780 }, { "epoch": 5.22282145898382, "grad_norm": 4.707131385803223, "learning_rate": 8.207151650000711e-07, "loss": 1.0898, "step": 36800 }, { "epoch": 5.2256599489071816, "grad_norm": 4.733769416809082, "learning_rate": 8.148148166481806e-07, "loss": 1.0565, "step": 36820 }, { "epoch": 5.228498438830542, "grad_norm": 4.345157623291016, "learning_rate": 8.089348535816744e-07, "loss": 1.0752, "step": 36840 }, { "epoch": 5.231336928753903, "grad_norm": 4.9313249588012695, "learning_rate": 8.030752888503457e-07, "loss": 1.0621, "step": 36860 }, { "epoch": 5.234175418677264, "grad_norm": 4.523575782775879, "learning_rate": 7.972361354587133e-07, "loss": 1.0729, "step": 36880 }, { "epoch": 5.237013908600624, "grad_norm": 4.836673736572266, "learning_rate": 7.914174063659874e-07, "loss": 1.0577, "step": 36900 }, { "epoch": 5.239852398523985, "grad_norm": 4.65406608581543, "learning_rate": 7.856191144860614e-07, "loss": 1.0758, "step": 36920 }, { "epoch": 5.242690888447346, "grad_norm": 4.818166255950928, "learning_rate": 7.798412726874661e-07, "loss": 1.0604, "step": 36940 }, { "epoch": 5.245529378370707, "grad_norm": 4.638568878173828, "learning_rate": 7.740838937933425e-07, "loss": 1.0652, "step": 36960 }, { "epoch": 5.248367868294068, "grad_norm": 4.849806785583496, "learning_rate": 7.683469905814223e-07, "loss": 1.0485, "step": 36980 }, { "epoch": 5.251206358217428, "grad_norm": 4.704647541046143, "learning_rate": 7.626305757839913e-07, "loss": 1.0897, "step": 37000 }, { "epoch": 5.254044848140789, "grad_norm": 4.382594108581543, "learning_rate": 7.569346620878637e-07, "loss": 1.0478, "step": 37020 }, { "epoch": 5.25688333806415, "grad_norm": 4.546744346618652, "learning_rate": 7.512592621343584e-07, "loss": 1.0668, "step": 37040 }, { "epoch": 5.25972182798751, "grad_norm": 4.52442741394043, "learning_rate": 7.45604388519261e-07, "loss": 1.0709, "step": 37060 }, { "epoch": 5.2625603179108715, "grad_norm": 4.861525535583496, "learning_rate": 7.399700537928034e-07, "loss": 1.077, "step": 37080 }, { "epoch": 5.265398807834233, "grad_norm": 5.026726245880127, "learning_rate": 7.343562704596385e-07, "loss": 1.0768, "step": 37100 }, { "epoch": 5.268237297757593, "grad_norm": 4.580838680267334, "learning_rate": 7.287630509788013e-07, "loss": 1.0663, "step": 37120 }, { "epoch": 5.271075787680954, "grad_norm": 4.323355197906494, "learning_rate": 7.231904077636942e-07, "loss": 1.0533, "step": 37140 }, { "epoch": 5.273914277604314, "grad_norm": 4.886282444000244, "learning_rate": 7.176383531820486e-07, "loss": 1.0822, "step": 37160 }, { "epoch": 5.276752767527675, "grad_norm": 5.0224809646606445, "learning_rate": 7.121068995559066e-07, "loss": 1.0715, "step": 37180 }, { "epoch": 5.279591257451036, "grad_norm": 4.690140247344971, "learning_rate": 7.06596059161585e-07, "loss": 1.0715, "step": 37200 }, { "epoch": 5.282429747374397, "grad_norm": 4.7767534255981445, "learning_rate": 7.011058442296547e-07, "loss": 1.0414, "step": 37220 }, { "epoch": 5.285268237297758, "grad_norm": 4.610667705535889, "learning_rate": 6.956362669449112e-07, "loss": 1.1049, "step": 37240 }, { "epoch": 5.288106727221118, "grad_norm": 4.695881366729736, "learning_rate": 6.901873394463454e-07, "loss": 1.0616, "step": 37260 }, { "epoch": 5.290945217144479, "grad_norm": 4.57548189163208, "learning_rate": 6.847590738271226e-07, "loss": 1.0727, "step": 37280 }, { "epoch": 5.29378370706784, "grad_norm": 4.444995403289795, "learning_rate": 6.79351482134547e-07, "loss": 1.0716, "step": 37300 }, { "epoch": 5.2966221969912, "grad_norm": 4.370624542236328, "learning_rate": 6.739645763700464e-07, "loss": 1.0629, "step": 37320 }, { "epoch": 5.2994606869145615, "grad_norm": 4.850456714630127, "learning_rate": 6.685983684891295e-07, "loss": 1.0787, "step": 37340 }, { "epoch": 5.302299176837923, "grad_norm": 4.399477958679199, "learning_rate": 6.632528704013807e-07, "loss": 1.0757, "step": 37360 }, { "epoch": 5.305137666761283, "grad_norm": 4.806722640991211, "learning_rate": 6.57928093970408e-07, "loss": 1.0446, "step": 37380 }, { "epoch": 5.307976156684644, "grad_norm": 4.807369709014893, "learning_rate": 6.526240510138437e-07, "loss": 1.0571, "step": 37400 }, { "epoch": 5.310814646608004, "grad_norm": 4.490498065948486, "learning_rate": 6.473407533032971e-07, "loss": 1.0558, "step": 37420 }, { "epoch": 5.313653136531365, "grad_norm": 4.458512306213379, "learning_rate": 6.420782125643376e-07, "loss": 1.0463, "step": 37440 }, { "epoch": 5.316491626454726, "grad_norm": 4.627263069152832, "learning_rate": 6.368364404764693e-07, "loss": 1.0459, "step": 37460 }, { "epoch": 5.319330116378087, "grad_norm": 4.738402843475342, "learning_rate": 6.316154486730996e-07, "loss": 1.0536, "step": 37480 }, { "epoch": 5.322168606301448, "grad_norm": 4.503163814544678, "learning_rate": 6.264152487415209e-07, "loss": 1.0531, "step": 37500 }, { "epoch": 5.325007096224809, "grad_norm": 4.579478740692139, "learning_rate": 6.212358522228779e-07, "loss": 1.0342, "step": 37520 }, { "epoch": 5.327845586148169, "grad_norm": 5.158321380615234, "learning_rate": 6.160772706121454e-07, "loss": 1.0554, "step": 37540 }, { "epoch": 5.33068407607153, "grad_norm": 4.952829360961914, "learning_rate": 6.10939515358101e-07, "loss": 1.0871, "step": 37560 }, { "epoch": 5.33352256599489, "grad_norm": 4.5105509757995605, "learning_rate": 6.058225978633092e-07, "loss": 1.0679, "step": 37580 }, { "epoch": 5.3363610559182515, "grad_norm": 4.370230197906494, "learning_rate": 6.007265294840736e-07, "loss": 1.0462, "step": 37600 }, { "epoch": 5.339199545841613, "grad_norm": 4.809624195098877, "learning_rate": 5.956513215304393e-07, "loss": 1.0776, "step": 37620 }, { "epoch": 5.342038035764973, "grad_norm": 4.8498430252075195, "learning_rate": 5.905969852661464e-07, "loss": 1.0845, "step": 37640 }, { "epoch": 5.344876525688334, "grad_norm": 4.69082498550415, "learning_rate": 5.855635319086162e-07, "loss": 1.0682, "step": 37660 }, { "epoch": 5.347715015611694, "grad_norm": 4.215461730957031, "learning_rate": 5.805509726289238e-07, "loss": 1.0539, "step": 37680 }, { "epoch": 5.350553505535055, "grad_norm": 4.278838157653809, "learning_rate": 5.755593185517694e-07, "loss": 1.0622, "step": 37700 }, { "epoch": 5.353391995458416, "grad_norm": 4.521454811096191, "learning_rate": 5.705885807554612e-07, "loss": 1.0727, "step": 37720 }, { "epoch": 5.356230485381777, "grad_norm": 4.640308380126953, "learning_rate": 5.656387702718836e-07, "loss": 1.0502, "step": 37740 }, { "epoch": 5.359068975305138, "grad_norm": 4.410296440124512, "learning_rate": 5.607098980864756e-07, "loss": 1.082, "step": 37760 }, { "epoch": 5.361907465228499, "grad_norm": 4.144856929779053, "learning_rate": 5.55801975138206e-07, "loss": 1.0166, "step": 37780 }, { "epoch": 5.364745955151859, "grad_norm": 4.342030048370361, "learning_rate": 5.509150123195573e-07, "loss": 1.0926, "step": 37800 }, { "epoch": 5.36758444507522, "grad_norm": 4.528135299682617, "learning_rate": 5.46049020476479e-07, "loss": 1.0859, "step": 37820 }, { "epoch": 5.37042293499858, "grad_norm": 4.458469390869141, "learning_rate": 5.412040104083926e-07, "loss": 1.058, "step": 37840 }, { "epoch": 5.3732614249219415, "grad_norm": 4.72668981552124, "learning_rate": 5.363799928681457e-07, "loss": 1.0918, "step": 37860 }, { "epoch": 5.376099914845303, "grad_norm": 4.876331806182861, "learning_rate": 5.315769785619973e-07, "loss": 1.0628, "step": 37880 }, { "epoch": 5.378938404768663, "grad_norm": 4.593716621398926, "learning_rate": 5.26794978149594e-07, "loss": 1.0723, "step": 37900 }, { "epoch": 5.381776894692024, "grad_norm": 4.435773849487305, "learning_rate": 5.220340022439418e-07, "loss": 1.0417, "step": 37920 }, { "epoch": 5.384615384615385, "grad_norm": 4.3642964363098145, "learning_rate": 5.172940614113886e-07, "loss": 1.069, "step": 37940 }, { "epoch": 5.387453874538745, "grad_norm": 4.839729309082031, "learning_rate": 5.125751661715972e-07, "loss": 1.0582, "step": 37960 }, { "epoch": 5.390292364462106, "grad_norm": 4.72569465637207, "learning_rate": 5.078773269975212e-07, "loss": 1.0558, "step": 37980 }, { "epoch": 5.393130854385467, "grad_norm": 4.931765556335449, "learning_rate": 5.032005543153829e-07, "loss": 1.0705, "step": 38000 }, { "epoch": 5.395969344308828, "grad_norm": 4.640488624572754, "learning_rate": 4.985448585046559e-07, "loss": 1.0589, "step": 38020 }, { "epoch": 5.398807834232189, "grad_norm": 4.844178676605225, "learning_rate": 4.939102498980252e-07, "loss": 1.0646, "step": 38040 }, { "epoch": 5.401646324155549, "grad_norm": 4.734241485595703, "learning_rate": 4.892967387813907e-07, "loss": 1.0852, "step": 38060 }, { "epoch": 5.40448481407891, "grad_norm": 4.55892276763916, "learning_rate": 4.847043353938119e-07, "loss": 1.0632, "step": 38080 }, { "epoch": 5.40732330400227, "grad_norm": 4.763373851776123, "learning_rate": 4.801330499275181e-07, "loss": 1.076, "step": 38100 }, { "epoch": 5.4101617939256315, "grad_norm": 4.619705677032471, "learning_rate": 4.755828925278616e-07, "loss": 1.0802, "step": 38120 }, { "epoch": 5.413000283848993, "grad_norm": 4.242789268493652, "learning_rate": 4.7105387329330564e-07, "loss": 1.0693, "step": 38140 }, { "epoch": 5.415838773772353, "grad_norm": 4.69885778427124, "learning_rate": 4.665460022754009e-07, "loss": 1.101, "step": 38160 }, { "epoch": 5.418677263695714, "grad_norm": 4.40032958984375, "learning_rate": 4.6205928947876236e-07, "loss": 1.0588, "step": 38180 }, { "epoch": 5.421515753619075, "grad_norm": 4.7738213539123535, "learning_rate": 4.5759374486104455e-07, "loss": 1.0801, "step": 38200 }, { "epoch": 5.424354243542435, "grad_norm": 4.91384220123291, "learning_rate": 4.531493783329255e-07, "loss": 1.0753, "step": 38220 }, { "epoch": 5.427192733465796, "grad_norm": 4.337026119232178, "learning_rate": 4.4872619975808273e-07, "loss": 1.0397, "step": 38240 }, { "epoch": 5.430031223389157, "grad_norm": 4.219902515411377, "learning_rate": 4.443242189531627e-07, "loss": 1.0567, "step": 38260 }, { "epoch": 5.432869713312518, "grad_norm": 4.471268653869629, "learning_rate": 4.3994344568777493e-07, "loss": 1.0685, "step": 38280 }, { "epoch": 5.435708203235879, "grad_norm": 4.5118584632873535, "learning_rate": 4.355838896844533e-07, "loss": 1.0639, "step": 38300 }, { "epoch": 5.438546693159239, "grad_norm": 4.19152307510376, "learning_rate": 4.312455606186505e-07, "loss": 1.0556, "step": 38320 }, { "epoch": 5.4413851830826, "grad_norm": 4.371646404266357, "learning_rate": 4.269284681187036e-07, "loss": 1.0917, "step": 38340 }, { "epoch": 5.444223673005961, "grad_norm": 4.657122611999512, "learning_rate": 4.2263262176582165e-07, "loss": 1.0986, "step": 38360 }, { "epoch": 5.447062162929321, "grad_norm": 4.4890851974487305, "learning_rate": 4.1835803109405713e-07, "loss": 1.0766, "step": 38380 }, { "epoch": 5.4499006528526825, "grad_norm": 4.524057388305664, "learning_rate": 4.141047055902914e-07, "loss": 1.0904, "step": 38400 }, { "epoch": 5.452739142776043, "grad_norm": 4.820863246917725, "learning_rate": 4.098726546942089e-07, "loss": 1.0623, "step": 38420 }, { "epoch": 5.455577632699404, "grad_norm": 4.73241662979126, "learning_rate": 4.0566188779827764e-07, "loss": 1.0386, "step": 38440 }, { "epoch": 5.458416122622765, "grad_norm": 4.562949180603027, "learning_rate": 4.0147241424773107e-07, "loss": 1.0358, "step": 38460 }, { "epoch": 5.461254612546125, "grad_norm": 4.273356914520264, "learning_rate": 3.973042433405405e-07, "loss": 1.061, "step": 38480 }, { "epoch": 5.464093102469486, "grad_norm": 4.478457927703857, "learning_rate": 3.9315738432740613e-07, "loss": 1.0557, "step": 38500 }, { "epoch": 5.4669315923928465, "grad_norm": 4.778354167938232, "learning_rate": 3.8903184641172044e-07, "loss": 1.0736, "step": 38520 }, { "epoch": 5.469770082316208, "grad_norm": 4.756141662597656, "learning_rate": 3.8492763874956376e-07, "loss": 1.0807, "step": 38540 }, { "epoch": 5.472608572239569, "grad_norm": 4.252477169036865, "learning_rate": 3.8084477044967206e-07, "loss": 1.0517, "step": 38560 }, { "epoch": 5.475447062162929, "grad_norm": 4.6566643714904785, "learning_rate": 3.767832505734259e-07, "loss": 1.0589, "step": 38580 }, { "epoch": 5.47828555208629, "grad_norm": 4.465487480163574, "learning_rate": 3.727430881348226e-07, "loss": 1.0513, "step": 38600 }, { "epoch": 5.481124042009651, "grad_norm": 4.549440860748291, "learning_rate": 3.6872429210045635e-07, "loss": 1.0878, "step": 38620 }, { "epoch": 5.483962531933011, "grad_norm": 4.882506847381592, "learning_rate": 3.6472687138951026e-07, "loss": 1.077, "step": 38640 }, { "epoch": 5.4868010218563725, "grad_norm": 4.839640140533447, "learning_rate": 3.607508348737221e-07, "loss": 1.0458, "step": 38660 }, { "epoch": 5.489639511779734, "grad_norm": 4.570183753967285, "learning_rate": 3.5679619137736876e-07, "loss": 1.0516, "step": 38680 }, { "epoch": 5.492478001703094, "grad_norm": 4.547548294067383, "learning_rate": 3.5286294967725176e-07, "loss": 1.0563, "step": 38700 }, { "epoch": 5.495316491626455, "grad_norm": 4.749202728271484, "learning_rate": 3.4895111850267614e-07, "loss": 1.0617, "step": 38720 }, { "epoch": 5.498154981549815, "grad_norm": 4.74296760559082, "learning_rate": 3.4506070653542055e-07, "loss": 1.0687, "step": 38740 }, { "epoch": 5.500993471473176, "grad_norm": 4.71812105178833, "learning_rate": 3.411917224097361e-07, "loss": 1.0585, "step": 38760 }, { "epoch": 5.503831961396537, "grad_norm": 4.680538654327393, "learning_rate": 3.3734417471231297e-07, "loss": 1.0643, "step": 38780 }, { "epoch": 5.506670451319898, "grad_norm": 4.557133197784424, "learning_rate": 3.3351807198226507e-07, "loss": 1.0459, "step": 38800 }, { "epoch": 5.509508941243259, "grad_norm": 5.0500617027282715, "learning_rate": 3.2971342271111537e-07, "loss": 1.039, "step": 38820 }, { "epoch": 5.512347431166619, "grad_norm": 4.702068328857422, "learning_rate": 3.259302353427707e-07, "loss": 1.0234, "step": 38840 }, { "epoch": 5.51518592108998, "grad_norm": 4.927919864654541, "learning_rate": 3.2216851827350795e-07, "loss": 1.0518, "step": 38860 }, { "epoch": 5.518024411013341, "grad_norm": 4.516992092132568, "learning_rate": 3.1842827985195445e-07, "loss": 1.0869, "step": 38880 }, { "epoch": 5.520862900936701, "grad_norm": 4.913153171539307, "learning_rate": 3.1470952837906686e-07, "loss": 1.0693, "step": 38900 }, { "epoch": 5.5237013908600625, "grad_norm": 4.620151996612549, "learning_rate": 3.110122721081121e-07, "loss": 1.0507, "step": 38920 }, { "epoch": 5.526539880783423, "grad_norm": 4.8783111572265625, "learning_rate": 3.0733651924465977e-07, "loss": 1.0779, "step": 38940 }, { "epoch": 5.529378370706784, "grad_norm": 4.6642937660217285, "learning_rate": 3.0368227794654426e-07, "loss": 1.0591, "step": 38960 }, { "epoch": 5.532216860630145, "grad_norm": 4.8268280029296875, "learning_rate": 3.000495563238659e-07, "loss": 1.0783, "step": 38980 }, { "epoch": 5.535055350553505, "grad_norm": 4.482834339141846, "learning_rate": 2.964383624389622e-07, "loss": 1.0676, "step": 39000 }, { "epoch": 5.537893840476866, "grad_norm": 4.50515079498291, "learning_rate": 2.9284870430639325e-07, "loss": 1.0829, "step": 39020 }, { "epoch": 5.540732330400227, "grad_norm": 4.45233678817749, "learning_rate": 2.8928058989292405e-07, "loss": 1.058, "step": 39040 }, { "epoch": 5.543570820323588, "grad_norm": 4.751028060913086, "learning_rate": 2.8573402711750465e-07, "loss": 1.0978, "step": 39060 }, { "epoch": 5.546409310246949, "grad_norm": 4.63545036315918, "learning_rate": 2.822090238512554e-07, "loss": 1.0896, "step": 39080 }, { "epoch": 5.54924780017031, "grad_norm": 4.384311199188232, "learning_rate": 2.787055879174472e-07, "loss": 1.0376, "step": 39100 }, { "epoch": 5.55208629009367, "grad_norm": 4.683865070343018, "learning_rate": 2.7522372709148704e-07, "loss": 1.0626, "step": 39120 }, { "epoch": 5.554924780017031, "grad_norm": 5.097659111022949, "learning_rate": 2.7176344910089693e-07, "loss": 1.0657, "step": 39140 }, { "epoch": 5.557763269940391, "grad_norm": 4.692623615264893, "learning_rate": 2.683247616252993e-07, "loss": 1.0613, "step": 39160 }, { "epoch": 5.5606017598637525, "grad_norm": 4.834789276123047, "learning_rate": 2.6490767229639947e-07, "loss": 1.089, "step": 39180 }, { "epoch": 5.563440249787114, "grad_norm": 4.829373836517334, "learning_rate": 2.615121886979677e-07, "loss": 1.0579, "step": 39200 }, { "epoch": 5.566278739710474, "grad_norm": 4.866196155548096, "learning_rate": 2.581383183658248e-07, "loss": 1.0426, "step": 39220 }, { "epoch": 5.569117229633835, "grad_norm": 4.743305683135986, "learning_rate": 2.547860687878212e-07, "loss": 1.0984, "step": 39240 }, { "epoch": 5.571955719557195, "grad_norm": 4.778231620788574, "learning_rate": 2.514554474038289e-07, "loss": 1.0722, "step": 39260 }, { "epoch": 5.574794209480556, "grad_norm": 4.500131130218506, "learning_rate": 2.4814646160571054e-07, "loss": 1.0491, "step": 39280 }, { "epoch": 5.577632699403917, "grad_norm": 4.30881929397583, "learning_rate": 2.4485911873731837e-07, "loss": 1.0761, "step": 39300 }, { "epoch": 5.580471189327278, "grad_norm": 4.878393173217773, "learning_rate": 2.4159342609446744e-07, "loss": 1.0447, "step": 39320 }, { "epoch": 5.583309679250639, "grad_norm": 4.47359561920166, "learning_rate": 2.383493909249257e-07, "loss": 1.017, "step": 39340 }, { "epoch": 5.586148169174, "grad_norm": 4.503353595733643, "learning_rate": 2.3512702042839176e-07, "loss": 1.063, "step": 39360 }, { "epoch": 5.58898665909736, "grad_norm": 4.713834762573242, "learning_rate": 2.31926321756486e-07, "loss": 1.0527, "step": 39380 }, { "epoch": 5.591825149020721, "grad_norm": 4.518667697906494, "learning_rate": 2.2874730201272732e-07, "loss": 1.0561, "step": 39400 }, { "epoch": 5.594663638944081, "grad_norm": 4.53288459777832, "learning_rate": 2.2558996825252534e-07, "loss": 1.0607, "step": 39420 }, { "epoch": 5.5975021288674425, "grad_norm": 4.465648174285889, "learning_rate": 2.2245432748315808e-07, "loss": 1.0428, "step": 39440 }, { "epoch": 5.600340618790804, "grad_norm": 4.715522289276123, "learning_rate": 2.1934038666375667e-07, "loss": 1.0463, "step": 39460 }, { "epoch": 5.603179108714164, "grad_norm": 4.595008373260498, "learning_rate": 2.1624815270529952e-07, "loss": 1.0093, "step": 39480 }, { "epoch": 5.606017598637525, "grad_norm": 4.68831729888916, "learning_rate": 2.1317763247058032e-07, "loss": 1.078, "step": 39500 }, { "epoch": 5.608856088560886, "grad_norm": 4.743823051452637, "learning_rate": 2.1012883277421015e-07, "loss": 1.0694, "step": 39520 }, { "epoch": 5.611694578484246, "grad_norm": 4.573362827301025, "learning_rate": 2.0710176038258644e-07, "loss": 1.088, "step": 39540 }, { "epoch": 5.614533068407607, "grad_norm": 4.2699503898620605, "learning_rate": 2.04096422013893e-07, "loss": 1.0213, "step": 39560 }, { "epoch": 5.617371558330968, "grad_norm": 4.59928035736084, "learning_rate": 2.0111282433807444e-07, "loss": 1.0607, "step": 39580 }, { "epoch": 5.620210048254329, "grad_norm": 4.976457595825195, "learning_rate": 1.9815097397682615e-07, "loss": 1.0879, "step": 39600 }, { "epoch": 5.62304853817769, "grad_norm": 4.729743957519531, "learning_rate": 1.9521087750357437e-07, "loss": 1.0745, "step": 39620 }, { "epoch": 5.62588702810105, "grad_norm": 4.773126602172852, "learning_rate": 1.9229254144347177e-07, "loss": 1.0765, "step": 39640 }, { "epoch": 5.628725518024411, "grad_norm": 4.7882771492004395, "learning_rate": 1.893959722733707e-07, "loss": 1.0531, "step": 39660 }, { "epoch": 5.631564007947771, "grad_norm": 4.665550231933594, "learning_rate": 1.8652117642181998e-07, "loss": 1.0766, "step": 39680 }, { "epoch": 5.6344024978711325, "grad_norm": 5.003387451171875, "learning_rate": 1.8366816026904154e-07, "loss": 1.0782, "step": 39700 }, { "epoch": 5.637240987794494, "grad_norm": 4.622972011566162, "learning_rate": 1.808369301469215e-07, "loss": 1.0491, "step": 39720 }, { "epoch": 5.640079477717854, "grad_norm": 4.64286470413208, "learning_rate": 1.7802749233899685e-07, "loss": 1.0627, "step": 39740 }, { "epoch": 5.642917967641215, "grad_norm": 4.801409721374512, "learning_rate": 1.7523985308043557e-07, "loss": 1.0375, "step": 39760 }, { "epoch": 5.645756457564576, "grad_norm": 4.224085807800293, "learning_rate": 1.724740185580298e-07, "loss": 1.0492, "step": 39780 }, { "epoch": 5.648594947487936, "grad_norm": 4.871814250946045, "learning_rate": 1.6972999491017828e-07, "loss": 1.0974, "step": 39800 }, { "epoch": 5.651433437411297, "grad_norm": 4.610102653503418, "learning_rate": 1.670077882268717e-07, "loss": 1.0551, "step": 39820 }, { "epoch": 5.654271927334658, "grad_norm": 4.371514797210693, "learning_rate": 1.6430740454968396e-07, "loss": 1.0625, "step": 39840 }, { "epoch": 5.657110417258019, "grad_norm": 4.701549530029297, "learning_rate": 1.6162884987175332e-07, "loss": 1.0633, "step": 39860 }, { "epoch": 5.65994890718138, "grad_norm": 4.277895450592041, "learning_rate": 1.5897213013777112e-07, "loss": 1.0512, "step": 39880 }, { "epoch": 5.66278739710474, "grad_norm": 4.789766311645508, "learning_rate": 1.5633725124397314e-07, "loss": 1.0856, "step": 39900 }, { "epoch": 5.665625887028101, "grad_norm": 5.025292873382568, "learning_rate": 1.5372421903811607e-07, "loss": 1.0733, "step": 39920 }, { "epoch": 5.668464376951462, "grad_norm": 4.3951544761657715, "learning_rate": 1.5113303931947543e-07, "loss": 1.078, "step": 39940 }, { "epoch": 5.671302866874822, "grad_norm": 4.925329685211182, "learning_rate": 1.4856371783882884e-07, "loss": 1.0781, "step": 39960 }, { "epoch": 5.6741413567981835, "grad_norm": 4.595816612243652, "learning_rate": 1.4601626029843606e-07, "loss": 1.0868, "step": 39980 }, { "epoch": 5.676979846721544, "grad_norm": 4.720486640930176, "learning_rate": 1.434906723520413e-07, "loss": 1.0505, "step": 40000 }, { "epoch": 5.679818336644905, "grad_norm": 4.337492942810059, "learning_rate": 1.409869596048452e-07, "loss": 1.0658, "step": 40020 }, { "epoch": 5.682656826568266, "grad_norm": 4.602992057800293, "learning_rate": 1.38505127613503e-07, "loss": 1.0899, "step": 40040 }, { "epoch": 5.685495316491626, "grad_norm": 4.901805400848389, "learning_rate": 1.360451818861075e-07, "loss": 1.0664, "step": 40060 }, { "epoch": 5.688333806414987, "grad_norm": 4.758505344390869, "learning_rate": 1.3360712788217822e-07, "loss": 1.0737, "step": 40080 }, { "epoch": 5.6911722963383475, "grad_norm": 4.629724025726318, "learning_rate": 1.3119097101264911e-07, "loss": 1.0337, "step": 40100 }, { "epoch": 5.694010786261709, "grad_norm": 4.645726680755615, "learning_rate": 1.2879671663985516e-07, "loss": 1.0507, "step": 40120 }, { "epoch": 5.69684927618507, "grad_norm": 4.488370895385742, "learning_rate": 1.264243700775225e-07, "loss": 1.0455, "step": 40140 }, { "epoch": 5.69968776610843, "grad_norm": 4.255455017089844, "learning_rate": 1.2407393659075506e-07, "loss": 1.035, "step": 40160 }, { "epoch": 5.702526256031791, "grad_norm": 4.418965816497803, "learning_rate": 1.2174542139602785e-07, "loss": 1.0732, "step": 40180 }, { "epoch": 5.705364745955152, "grad_norm": 4.657048225402832, "learning_rate": 1.1943882966116372e-07, "loss": 1.109, "step": 40200 }, { "epoch": 5.708203235878512, "grad_norm": 4.571644306182861, "learning_rate": 1.1715416650533551e-07, "loss": 1.0819, "step": 40220 }, { "epoch": 5.7110417258018735, "grad_norm": 4.660852432250977, "learning_rate": 1.1489143699904504e-07, "loss": 1.0698, "step": 40240 }, { "epoch": 5.713880215725235, "grad_norm": 4.897495269775391, "learning_rate": 1.1265064616411858e-07, "loss": 1.066, "step": 40260 }, { "epoch": 5.716718705648595, "grad_norm": 4.33512544631958, "learning_rate": 1.1043179897368916e-07, "loss": 1.1013, "step": 40280 }, { "epoch": 5.719557195571956, "grad_norm": 4.55518913269043, "learning_rate": 1.0823490035218986e-07, "loss": 1.0812, "step": 40300 }, { "epoch": 5.722395685495316, "grad_norm": 4.666219234466553, "learning_rate": 1.0605995517534385e-07, "loss": 1.0506, "step": 40320 }, { "epoch": 5.725234175418677, "grad_norm": 4.671894550323486, "learning_rate": 1.0390696827014879e-07, "loss": 1.0922, "step": 40340 }, { "epoch": 5.728072665342038, "grad_norm": 4.181873798370361, "learning_rate": 1.0177594441487026e-07, "loss": 1.0735, "step": 40360 }, { "epoch": 5.730911155265399, "grad_norm": 4.4991350173950195, "learning_rate": 9.966688833902948e-08, "loss": 1.066, "step": 40380 }, { "epoch": 5.73374964518876, "grad_norm": 4.654384136199951, "learning_rate": 9.757980472339445e-08, "loss": 1.0453, "step": 40400 }, { "epoch": 5.73658813511212, "grad_norm": 4.782173156738281, "learning_rate": 9.55146981999644e-08, "loss": 1.1165, "step": 40420 }, { "epoch": 5.739426625035481, "grad_norm": 4.669752597808838, "learning_rate": 9.347157335196976e-08, "loss": 1.0682, "step": 40440 }, { "epoch": 5.742265114958842, "grad_norm": 4.683998107910156, "learning_rate": 9.145043471384784e-08, "loss": 1.0699, "step": 40460 }, { "epoch": 5.745103604882202, "grad_norm": 4.747989654541016, "learning_rate": 8.945128677124715e-08, "loss": 1.0542, "step": 40480 }, { "epoch": 5.7479420948055635, "grad_norm": 4.80647611618042, "learning_rate": 8.747413396100746e-08, "loss": 1.1124, "step": 40500 }, { "epoch": 5.750780584728924, "grad_norm": 4.3610310554504395, "learning_rate": 8.551898067115428e-08, "loss": 1.0659, "step": 40520 }, { "epoch": 5.753619074652285, "grad_norm": 4.445956230163574, "learning_rate": 8.358583124088659e-08, "loss": 1.0699, "step": 40540 }, { "epoch": 5.756457564575646, "grad_norm": 4.540678977966309, "learning_rate": 8.167468996057248e-08, "loss": 1.054, "step": 40560 }, { "epoch": 5.759296054499006, "grad_norm": 4.4650139808654785, "learning_rate": 7.978556107173018e-08, "loss": 1.0715, "step": 40580 }, { "epoch": 5.762134544422367, "grad_norm": 4.847330093383789, "learning_rate": 7.791844876703036e-08, "loss": 1.0462, "step": 40600 }, { "epoch": 5.764973034345728, "grad_norm": 4.661546230316162, "learning_rate": 7.60733571902772e-08, "loss": 1.0563, "step": 40620 }, { "epoch": 5.767811524269089, "grad_norm": 4.824905872344971, "learning_rate": 7.425029043640397e-08, "loss": 1.0148, "step": 40640 }, { "epoch": 5.77065001419245, "grad_norm": 4.876687526702881, "learning_rate": 7.244925255146196e-08, "loss": 1.0687, "step": 40660 }, { "epoch": 5.773488504115811, "grad_norm": 4.352469444274902, "learning_rate": 7.067024753261375e-08, "loss": 1.0645, "step": 40680 }, { "epoch": 5.776326994039171, "grad_norm": 5.080018997192383, "learning_rate": 6.891327932812109e-08, "loss": 1.0685, "step": 40700 }, { "epoch": 5.779165483962532, "grad_norm": 4.722990989685059, "learning_rate": 6.717835183734033e-08, "loss": 1.0363, "step": 40720 }, { "epoch": 5.782003973885892, "grad_norm": 4.763210773468018, "learning_rate": 6.546546891070815e-08, "loss": 1.0656, "step": 40740 }, { "epoch": 5.7848424638092535, "grad_norm": 4.880441665649414, "learning_rate": 6.377463434974141e-08, "loss": 1.1119, "step": 40760 }, { "epoch": 5.787680953732615, "grad_norm": 4.583034515380859, "learning_rate": 6.210585190701612e-08, "loss": 1.0557, "step": 40780 }, { "epoch": 5.790519443655975, "grad_norm": 4.903120994567871, "learning_rate": 6.045912528617526e-08, "loss": 1.0485, "step": 40800 }, { "epoch": 5.793357933579336, "grad_norm": 4.637136459350586, "learning_rate": 5.883445814190536e-08, "loss": 1.0812, "step": 40820 }, { "epoch": 5.796196423502696, "grad_norm": 4.762404441833496, "learning_rate": 5.723185407993659e-08, "loss": 1.0458, "step": 40840 }, { "epoch": 5.799034913426057, "grad_norm": 4.230287551879883, "learning_rate": 5.5651316657034935e-08, "loss": 1.0454, "step": 40860 }, { "epoch": 5.801873403349418, "grad_norm": 5.043659687042236, "learning_rate": 5.409284938098891e-08, "loss": 1.0686, "step": 40880 }, { "epoch": 5.804711893272779, "grad_norm": 4.565435886383057, "learning_rate": 5.2556455710610635e-08, "loss": 1.0577, "step": 40900 }, { "epoch": 5.80755038319614, "grad_norm": 4.7674241065979, "learning_rate": 5.1042139055715865e-08, "loss": 1.0705, "step": 40920 }, { "epoch": 5.810388873119501, "grad_norm": 4.503202438354492, "learning_rate": 4.954990277712957e-08, "loss": 1.0629, "step": 40940 }, { "epoch": 5.813227363042861, "grad_norm": 4.560409069061279, "learning_rate": 4.807975018666922e-08, "loss": 1.0796, "step": 40960 }, { "epoch": 5.816065852966222, "grad_norm": 4.827771186828613, "learning_rate": 4.663168454714151e-08, "loss": 1.0455, "step": 40980 }, { "epoch": 5.818904342889582, "grad_norm": 4.5242204666137695, "learning_rate": 4.520570907233235e-08, "loss": 1.0773, "step": 41000 }, { "epoch": 5.8217428328129435, "grad_norm": 4.654844284057617, "learning_rate": 4.380182692700463e-08, "loss": 1.0371, "step": 41020 }, { "epoch": 5.824581322736305, "grad_norm": 4.396212577819824, "learning_rate": 4.2420041226886035e-08, "loss": 1.0739, "step": 41040 }, { "epoch": 5.827419812659665, "grad_norm": 4.344109535217285, "learning_rate": 4.1060355038662345e-08, "loss": 1.0306, "step": 41060 }, { "epoch": 5.830258302583026, "grad_norm": 4.865748882293701, "learning_rate": 3.9722771379977485e-08, "loss": 1.0596, "step": 41080 }, { "epoch": 5.833096792506387, "grad_norm": 4.67423677444458, "learning_rate": 3.840729321941683e-08, "loss": 1.0836, "step": 41100 }, { "epoch": 5.835935282429747, "grad_norm": 4.889367580413818, "learning_rate": 3.711392347650722e-08, "loss": 1.0684, "step": 41120 }, { "epoch": 5.838773772353108, "grad_norm": 4.706118583679199, "learning_rate": 3.584266502171141e-08, "loss": 1.0674, "step": 41140 }, { "epoch": 5.841612262276469, "grad_norm": 4.500822067260742, "learning_rate": 3.459352067641475e-08, "loss": 1.0399, "step": 41160 }, { "epoch": 5.84445075219983, "grad_norm": 4.591899871826172, "learning_rate": 3.3366493212926286e-08, "loss": 1.088, "step": 41180 }, { "epoch": 5.847289242123191, "grad_norm": 4.621219635009766, "learning_rate": 3.216158535446878e-08, "loss": 1.0784, "step": 41200 }, { "epoch": 5.850127732046551, "grad_norm": 4.90480375289917, "learning_rate": 3.0978799775173155e-08, "loss": 1.0501, "step": 41220 }, { "epoch": 5.852966221969912, "grad_norm": 4.444205284118652, "learning_rate": 2.981813910007292e-08, "loss": 1.0463, "step": 41240 }, { "epoch": 5.855804711893272, "grad_norm": 4.181506156921387, "learning_rate": 2.867960590509977e-08, "loss": 1.0564, "step": 41260 }, { "epoch": 5.8586432018166335, "grad_norm": 4.733233451843262, "learning_rate": 2.756320271707469e-08, "loss": 1.0675, "step": 41280 }, { "epoch": 5.861481691739995, "grad_norm": 4.936706066131592, "learning_rate": 2.6468932013704595e-08, "loss": 1.0881, "step": 41300 }, { "epoch": 5.864320181663355, "grad_norm": 4.707117557525635, "learning_rate": 2.5396796223575716e-08, "loss": 1.0197, "step": 41320 }, { "epoch": 5.867158671586716, "grad_norm": 4.349254608154297, "learning_rate": 2.434679772615023e-08, "loss": 1.0789, "step": 41340 }, { "epoch": 5.869997161510077, "grad_norm": 4.593410968780518, "learning_rate": 2.331893885175962e-08, "loss": 1.0556, "step": 41360 }, { "epoch": 5.872835651433437, "grad_norm": 4.701520919799805, "learning_rate": 2.23132218815969e-08, "loss": 1.1051, "step": 41380 }, { "epoch": 5.875674141356798, "grad_norm": 4.662505149841309, "learning_rate": 2.1329649047716615e-08, "loss": 1.0749, "step": 41400 }, { "epoch": 5.878512631280159, "grad_norm": 4.7816338539123535, "learning_rate": 2.036822253302817e-08, "loss": 1.0292, "step": 41420 }, { "epoch": 5.88135112120352, "grad_norm": 4.727035045623779, "learning_rate": 1.942894447128585e-08, "loss": 1.0804, "step": 41440 }, { "epoch": 5.884189611126881, "grad_norm": 4.775646209716797, "learning_rate": 1.8511816947093252e-08, "loss": 1.0811, "step": 41460 }, { "epoch": 5.887028101050241, "grad_norm": 4.651393890380859, "learning_rate": 1.7616841995891087e-08, "loss": 1.0463, "step": 41480 }, { "epoch": 5.889866590973602, "grad_norm": 4.876699447631836, "learning_rate": 1.6744021603956052e-08, "loss": 1.0772, "step": 41500 }, { "epoch": 5.892705080896963, "grad_norm": 4.986159801483154, "learning_rate": 1.5893357708395286e-08, "loss": 1.0706, "step": 41520 }, { "epoch": 5.895543570820323, "grad_norm": 4.486198425292969, "learning_rate": 1.5064852197145263e-08, "loss": 1.0689, "step": 41540 }, { "epoch": 5.8983820607436845, "grad_norm": 4.947371959686279, "learning_rate": 1.4258506908959579e-08, "loss": 1.0827, "step": 41560 }, { "epoch": 5.901220550667045, "grad_norm": 4.569003105163574, "learning_rate": 1.347432363341672e-08, "loss": 1.0681, "step": 41580 }, { "epoch": 5.904059040590406, "grad_norm": 4.4745097160339355, "learning_rate": 1.2712304110903406e-08, "loss": 1.0476, "step": 41600 }, { "epoch": 5.906897530513767, "grad_norm": 4.7210283279418945, "learning_rate": 1.1972450032621263e-08, "loss": 1.0602, "step": 41620 }, { "epoch": 5.909736020437127, "grad_norm": 4.723958492279053, "learning_rate": 1.1254763040575712e-08, "loss": 1.0667, "step": 41640 }, { "epoch": 5.912574510360488, "grad_norm": 4.525557041168213, "learning_rate": 1.0559244727578189e-08, "loss": 1.0882, "step": 41660 }, { "epoch": 5.9154130002838485, "grad_norm": 4.9522600173950195, "learning_rate": 9.88589663723838e-09, "loss": 1.0822, "step": 41680 }, { "epoch": 5.91825149020721, "grad_norm": 4.40378999710083, "learning_rate": 9.234720263959774e-09, "loss": 1.0551, "step": 41700 }, { "epoch": 5.921089980130571, "grad_norm": 4.932559490203857, "learning_rate": 8.605717052942997e-09, "loss": 1.0806, "step": 41720 }, { "epoch": 5.923928470053931, "grad_norm": 4.732334136962891, "learning_rate": 7.998888400175819e-09, "loss": 1.0592, "step": 41740 }, { "epoch": 5.926766959977292, "grad_norm": 4.6784796714782715, "learning_rate": 7.414235652432045e-09, "loss": 1.0586, "step": 41760 }, { "epoch": 5.929605449900653, "grad_norm": 4.744207859039307, "learning_rate": 6.8517601072704e-09, "loss": 1.0744, "step": 41780 }, { "epoch": 5.932443939824013, "grad_norm": 4.730002403259277, "learning_rate": 6.3114630130300945e-09, "loss": 1.0491, "step": 41800 }, { "epoch": 5.9352824297473745, "grad_norm": 4.699164390563965, "learning_rate": 5.793345568828601e-09, "loss": 1.0707, "step": 41820 }, { "epoch": 5.938120919670736, "grad_norm": 4.583643436431885, "learning_rate": 5.297408924557212e-09, "loss": 1.0791, "step": 41840 }, { "epoch": 5.940959409594096, "grad_norm": 4.68289041519165, "learning_rate": 4.823654180881043e-09, "loss": 1.0588, "step": 41860 }, { "epoch": 5.943797899517457, "grad_norm": 4.344052314758301, "learning_rate": 4.3720823892345884e-09, "loss": 1.0661, "step": 41880 }, { "epoch": 5.946636389440817, "grad_norm": 4.643074035644531, "learning_rate": 3.942694551820614e-09, "loss": 1.0704, "step": 41900 }, { "epoch": 5.949474879364178, "grad_norm": 4.583273410797119, "learning_rate": 3.535491621609044e-09, "loss": 1.0198, "step": 41920 }, { "epoch": 5.952313369287539, "grad_norm": 4.577699661254883, "learning_rate": 3.150474502331413e-09, "loss": 1.0876, "step": 41940 }, { "epoch": 5.9551518592109, "grad_norm": 4.402993202209473, "learning_rate": 2.7876440484819744e-09, "loss": 1.0482, "step": 41960 }, { "epoch": 5.957990349134261, "grad_norm": 4.697798728942871, "learning_rate": 2.447001065313259e-09, "loss": 1.063, "step": 41980 }, { "epoch": 5.960828839057621, "grad_norm": 4.389227390289307, "learning_rate": 2.128546308837187e-09, "loss": 1.0808, "step": 42000 }, { "epoch": 5.963667328980982, "grad_norm": 4.866738796234131, "learning_rate": 1.8322804858217357e-09, "loss": 1.079, "step": 42020 }, { "epoch": 5.966505818904343, "grad_norm": 4.590463161468506, "learning_rate": 1.5582042537887198e-09, "loss": 1.0747, "step": 42040 }, { "epoch": 5.969344308827703, "grad_norm": 4.670779705047607, "learning_rate": 1.3063182210126813e-09, "loss": 1.1003, "step": 42060 }, { "epoch": 5.9721827987510645, "grad_norm": 4.655738830566406, "learning_rate": 1.076622946521999e-09, "loss": 1.0459, "step": 42080 }, { "epoch": 5.975021288674425, "grad_norm": 4.429340839385986, "learning_rate": 8.691189400944489e-10, "loss": 1.0679, "step": 42100 }, { "epoch": 5.977859778597786, "grad_norm": 4.772407531738281, "learning_rate": 6.838066622572026e-10, "loss": 1.104, "step": 42120 }, { "epoch": 5.980698268521147, "grad_norm": 4.665767669677734, "learning_rate": 5.206865242857184e-10, "loss": 1.0425, "step": 42140 }, { "epoch": 5.983536758444507, "grad_norm": 4.546030044555664, "learning_rate": 3.7975888820374065e-10, "loss": 1.0893, "step": 42160 }, { "epoch": 5.986375248367868, "grad_norm": 4.733661651611328, "learning_rate": 2.6102406677996907e-10, "loss": 1.0756, "step": 42180 }, { "epoch": 5.989213738291229, "grad_norm": 4.697795867919922, "learning_rate": 1.6448232353249994e-10, "loss": 1.0763, "step": 42200 }, { "epoch": 5.99205222821459, "grad_norm": 4.739811420440674, "learning_rate": 9.013387272105434e-11, "loss": 1.0784, "step": 42220 }, { "epoch": 5.994890718137951, "grad_norm": 4.439469337463379, "learning_rate": 3.797887935141908e-11, "loss": 1.0723, "step": 42240 }, { "epoch": 5.997729208061312, "grad_norm": 4.55684757232666, "learning_rate": 8.017459177667164e-12, "loss": 1.0629, "step": 42260 }, { "epoch": 6.0, "step": 42276, "total_flos": 2.1751352755916636e+18, "train_loss": 1.5761911902660926, "train_runtime": 25038.7369, "train_samples_per_second": 40.519, "train_steps_per_second": 1.688 } ], "logging_steps": 20, "max_steps": 42276, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1751352755916636e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }