starcoder2-3b-lora-apr / trainer_state.json
mqddd's picture
Upload folder using huggingface_hub
d79f62a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.7491289198606275,
"eval_steps": 500,
"global_step": 4125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013937282229965157,
"grad_norm": 0.689270555973053,
"learning_rate": 1e-05,
"loss": 3.465,
"step": 10
},
{
"epoch": 0.027874564459930314,
"grad_norm": 0.3839555084705353,
"learning_rate": 2e-05,
"loss": 3.4261,
"step": 20
},
{
"epoch": 0.041811846689895474,
"grad_norm": 0.8513675928115845,
"learning_rate": 1.9999707152644143e-05,
"loss": 3.0148,
"step": 30
},
{
"epoch": 0.05574912891986063,
"grad_norm": 0.5808806419372559,
"learning_rate": 1.9998828627728483e-05,
"loss": 2.8815,
"step": 40
},
{
"epoch": 0.06968641114982578,
"grad_norm": 0.8429257869720459,
"learning_rate": 1.9997364476707765e-05,
"loss": 2.4359,
"step": 50
},
{
"epoch": 0.08362369337979095,
"grad_norm": 9.207444190979004,
"learning_rate": 1.9995314785336534e-05,
"loss": 2.3272,
"step": 60
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.8501074314117432,
"learning_rate": 1.9992679673664136e-05,
"loss": 2.3717,
"step": 70
},
{
"epoch": 0.11149825783972125,
"grad_norm": 1.1296913623809814,
"learning_rate": 1.998945929602766e-05,
"loss": 1.9401,
"step": 80
},
{
"epoch": 0.1254355400696864,
"grad_norm": 0.7665435075759888,
"learning_rate": 1.9985653841042926e-05,
"loss": 2.0245,
"step": 90
},
{
"epoch": 0.13937282229965156,
"grad_norm": 1.1252214908599854,
"learning_rate": 1.9981263531593422e-05,
"loss": 1.847,
"step": 100
},
{
"epoch": 0.15331010452961671,
"grad_norm": 0.6236267685890198,
"learning_rate": 1.997628862481725e-05,
"loss": 1.6229,
"step": 110
},
{
"epoch": 0.1672473867595819,
"grad_norm": 0.41556432843208313,
"learning_rate": 1.9970729412092064e-05,
"loss": 1.6222,
"step": 120
},
{
"epoch": 0.18118466898954705,
"grad_norm": 1.1682895421981812,
"learning_rate": 1.9964586219018018e-05,
"loss": 1.6162,
"step": 130
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.804908037185669,
"learning_rate": 1.995785940539868e-05,
"loss": 1.5503,
"step": 140
},
{
"epoch": 0.20905923344947736,
"grad_norm": 0.5646180510520935,
"learning_rate": 1.995054936521997e-05,
"loss": 1.5357,
"step": 150
},
{
"epoch": 0.2229965156794425,
"grad_norm": 0.667831301689148,
"learning_rate": 1.994265652662707e-05,
"loss": 1.4177,
"step": 160
},
{
"epoch": 0.23693379790940766,
"grad_norm": 0.6871632933616638,
"learning_rate": 1.9934181351899365e-05,
"loss": 1.4035,
"step": 170
},
{
"epoch": 0.2508710801393728,
"grad_norm": 0.5316534638404846,
"learning_rate": 1.9925124337423356e-05,
"loss": 1.3704,
"step": 180
},
{
"epoch": 0.26480836236933797,
"grad_norm": 0.4839189946651459,
"learning_rate": 1.9915486013663595e-05,
"loss": 1.2654,
"step": 190
},
{
"epoch": 0.2787456445993031,
"grad_norm": 0.6332902908325195,
"learning_rate": 1.99052669451316e-05,
"loss": 1.341,
"step": 200
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.8837312459945679,
"learning_rate": 1.9894467730352817e-05,
"loss": 1.2951,
"step": 210
},
{
"epoch": 0.30662020905923343,
"grad_norm": 0.5074018239974976,
"learning_rate": 1.9883089001831545e-05,
"loss": 1.2118,
"step": 220
},
{
"epoch": 0.3205574912891986,
"grad_norm": 0.4832253158092499,
"learning_rate": 1.9871131426013894e-05,
"loss": 1.1841,
"step": 230
},
{
"epoch": 0.3344947735191638,
"grad_norm": 0.593585193157196,
"learning_rate": 1.9858595703248755e-05,
"loss": 1.1881,
"step": 240
},
{
"epoch": 0.34843205574912894,
"grad_norm": 2.605353832244873,
"learning_rate": 1.9845482567746783e-05,
"loss": 1.185,
"step": 250
},
{
"epoch": 0.3623693379790941,
"grad_norm": 0.5042847990989685,
"learning_rate": 1.983179278753739e-05,
"loss": 1.1611,
"step": 260
},
{
"epoch": 0.37630662020905925,
"grad_norm": 0.5455293655395508,
"learning_rate": 1.981752716442376e-05,
"loss": 1.1387,
"step": 270
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.5077016949653625,
"learning_rate": 1.9802686533935903e-05,
"loss": 1.1303,
"step": 280
},
{
"epoch": 0.40418118466898956,
"grad_norm": 1.6482897996902466,
"learning_rate": 1.9787271765281684e-05,
"loss": 1.1113,
"step": 290
},
{
"epoch": 0.4181184668989547,
"grad_norm": 0.706795871257782,
"learning_rate": 1.9771283761295966e-05,
"loss": 1.2045,
"step": 300
},
{
"epoch": 0.43205574912891986,
"grad_norm": 0.4687112867832184,
"learning_rate": 1.975472345838768e-05,
"loss": 1.1115,
"step": 310
},
{
"epoch": 0.445993031358885,
"grad_norm": 0.6432089805603027,
"learning_rate": 1.9737591826485013e-05,
"loss": 1.0392,
"step": 320
},
{
"epoch": 0.45993031358885017,
"grad_norm": 0.5330508351325989,
"learning_rate": 1.9719889868978582e-05,
"loss": 1.1082,
"step": 330
},
{
"epoch": 0.4738675958188153,
"grad_norm": 0.5292240381240845,
"learning_rate": 1.970161862266268e-05,
"loss": 1.0845,
"step": 340
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.5111907720565796,
"learning_rate": 1.968277915767454e-05,
"loss": 1.0763,
"step": 350
},
{
"epoch": 0.5017421602787456,
"grad_norm": 0.6637840270996094,
"learning_rate": 1.9663372577431663e-05,
"loss": 1.0727,
"step": 360
},
{
"epoch": 0.5156794425087108,
"grad_norm": 0.5857555270195007,
"learning_rate": 1.9643400018567195e-05,
"loss": 1.0431,
"step": 370
},
{
"epoch": 0.5296167247386759,
"grad_norm": 0.4895532429218292,
"learning_rate": 1.962286265086334e-05,
"loss": 1.0656,
"step": 380
},
{
"epoch": 0.5435540069686411,
"grad_norm": 0.4925576448440552,
"learning_rate": 1.9601761677182868e-05,
"loss": 1.0581,
"step": 390
},
{
"epoch": 0.5574912891986062,
"grad_norm": 0.49863460659980774,
"learning_rate": 1.958009833339865e-05,
"loss": 1.0236,
"step": 400
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.4993515610694885,
"learning_rate": 1.955787388832127e-05,
"loss": 1.0382,
"step": 410
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.5540180206298828,
"learning_rate": 1.953508964362473e-05,
"loss": 1.06,
"step": 420
},
{
"epoch": 0.5993031358885017,
"grad_norm": 0.47915443778038025,
"learning_rate": 1.9511746933770186e-05,
"loss": 1.0793,
"step": 430
},
{
"epoch": 0.6132404181184669,
"grad_norm": 0.931480348110199,
"learning_rate": 1.9487847125927814e-05,
"loss": 1.0125,
"step": 440
},
{
"epoch": 0.627177700348432,
"grad_norm": 0.5183358192443848,
"learning_rate": 1.946339161989672e-05,
"loss": 0.9963,
"step": 450
},
{
"epoch": 0.6411149825783972,
"grad_norm": 0.4870765507221222,
"learning_rate": 1.943838184802296e-05,
"loss": 1.0556,
"step": 460
},
{
"epoch": 0.6550522648083623,
"grad_norm": 0.5741175413131714,
"learning_rate": 1.9412819275115648e-05,
"loss": 1.0337,
"step": 470
},
{
"epoch": 0.6689895470383276,
"grad_norm": 6.292229652404785,
"learning_rate": 1.9386705398361156e-05,
"loss": 1.0487,
"step": 480
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.508897602558136,
"learning_rate": 1.9360041747235437e-05,
"loss": 1.0538,
"step": 490
},
{
"epoch": 0.6968641114982579,
"grad_norm": 2.2197482585906982,
"learning_rate": 1.9332829883414444e-05,
"loss": 0.9835,
"step": 500
},
{
"epoch": 0.710801393728223,
"grad_norm": 0.5469727516174316,
"learning_rate": 1.9305071400682644e-05,
"loss": 1.023,
"step": 510
},
{
"epoch": 0.7247386759581882,
"grad_norm": 0.5392600297927856,
"learning_rate": 1.9276767924839687e-05,
"loss": 0.9832,
"step": 520
},
{
"epoch": 0.7386759581881533,
"grad_norm": 0.5167688727378845,
"learning_rate": 1.9247921113605197e-05,
"loss": 0.9896,
"step": 530
},
{
"epoch": 0.7526132404181185,
"grad_norm": 0.622711718082428,
"learning_rate": 1.921853265652164e-05,
"loss": 0.9443,
"step": 540
},
{
"epoch": 0.7665505226480837,
"grad_norm": 0.7172455787658691,
"learning_rate": 1.9188604274855417e-05,
"loss": 1.0113,
"step": 550
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.5286630988121033,
"learning_rate": 1.9158137721496014e-05,
"loss": 1.0027,
"step": 560
},
{
"epoch": 0.794425087108014,
"grad_norm": 0.531980037689209,
"learning_rate": 1.9127134780853343e-05,
"loss": 0.9521,
"step": 570
},
{
"epoch": 0.8083623693379791,
"grad_norm": 0.5521572828292847,
"learning_rate": 1.9095597268753243e-05,
"loss": 1.006,
"step": 580
},
{
"epoch": 0.8222996515679443,
"grad_norm": 0.953914999961853,
"learning_rate": 1.9063527032331128e-05,
"loss": 0.9401,
"step": 590
},
{
"epoch": 0.8362369337979094,
"grad_norm": 0.6153273582458496,
"learning_rate": 1.9030925949923777e-05,
"loss": 0.9872,
"step": 600
},
{
"epoch": 0.8501742160278746,
"grad_norm": 0.7658360004425049,
"learning_rate": 1.899779593095935e-05,
"loss": 0.9842,
"step": 610
},
{
"epoch": 0.8641114982578397,
"grad_norm": 0.6273182034492493,
"learning_rate": 1.896413891584554e-05,
"loss": 0.9636,
"step": 620
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.5916078090667725,
"learning_rate": 1.8929956875855913e-05,
"loss": 0.9845,
"step": 630
},
{
"epoch": 0.89198606271777,
"grad_norm": 0.5227757096290588,
"learning_rate": 1.8895251813014486e-05,
"loss": 0.9813,
"step": 640
},
{
"epoch": 0.9059233449477352,
"grad_norm": 0.5434836745262146,
"learning_rate": 1.8860025759978436e-05,
"loss": 0.9522,
"step": 650
},
{
"epoch": 0.9198606271777003,
"grad_norm": 0.48904576897621155,
"learning_rate": 1.8824280779919055e-05,
"loss": 0.937,
"step": 660
},
{
"epoch": 0.9337979094076655,
"grad_norm": 0.6036517024040222,
"learning_rate": 1.8788018966400923e-05,
"loss": 0.9587,
"step": 670
},
{
"epoch": 0.9477351916376306,
"grad_norm": 0.5333548188209534,
"learning_rate": 1.8751242443259286e-05,
"loss": 0.9564,
"step": 680
},
{
"epoch": 0.9616724738675958,
"grad_norm": 0.6121916770935059,
"learning_rate": 1.8713953364475654e-05,
"loss": 0.9676,
"step": 690
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.5846276879310608,
"learning_rate": 1.8676153914051648e-05,
"loss": 0.9417,
"step": 700
},
{
"epoch": 0.9895470383275261,
"grad_norm": 0.6067516207695007,
"learning_rate": 1.8637846305881092e-05,
"loss": 0.9615,
"step": 710
},
{
"epoch": 1.0034843205574913,
"grad_norm": 0.4959181547164917,
"learning_rate": 1.859903278362034e-05,
"loss": 0.9361,
"step": 720
},
{
"epoch": 1.0174216027874565,
"grad_norm": 0.554417073726654,
"learning_rate": 1.8559715620556865e-05,
"loss": 0.984,
"step": 730
},
{
"epoch": 1.0313588850174216,
"grad_norm": 0.5190228223800659,
"learning_rate": 1.8519897119476115e-05,
"loss": 0.9571,
"step": 740
},
{
"epoch": 1.0452961672473868,
"grad_norm": 0.5573728680610657,
"learning_rate": 1.8479579612526642e-05,
"loss": 0.9324,
"step": 750
},
{
"epoch": 1.0592334494773519,
"grad_norm": 0.5803472399711609,
"learning_rate": 1.8438765461083504e-05,
"loss": 0.9274,
"step": 760
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.5444169044494629,
"learning_rate": 1.8397457055609973e-05,
"loss": 0.9278,
"step": 770
},
{
"epoch": 1.0871080139372822,
"grad_norm": 0.5003806352615356,
"learning_rate": 1.8355656815517505e-05,
"loss": 0.9392,
"step": 780
},
{
"epoch": 1.1010452961672474,
"grad_norm": 1.262714147567749,
"learning_rate": 1.8313367189024065e-05,
"loss": 0.9741,
"step": 790
},
{
"epoch": 1.1149825783972125,
"grad_norm": 0.8410710692405701,
"learning_rate": 1.8270590653010706e-05,
"loss": 0.9815,
"step": 800
},
{
"epoch": 1.1289198606271778,
"grad_norm": 0.554559051990509,
"learning_rate": 1.8227329712876525e-05,
"loss": 0.9085,
"step": 810
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.5503095388412476,
"learning_rate": 1.8183586902391905e-05,
"loss": 0.9034,
"step": 820
},
{
"epoch": 1.156794425087108,
"grad_norm": 0.5442657470703125,
"learning_rate": 1.8139364783550128e-05,
"loss": 0.9525,
"step": 830
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.5207365155220032,
"learning_rate": 1.8094665946417304e-05,
"loss": 0.9166,
"step": 840
},
{
"epoch": 1.1846689895470384,
"grad_norm": 0.48352962732315063,
"learning_rate": 1.8049493008980685e-05,
"loss": 0.9187,
"step": 850
},
{
"epoch": 1.1986062717770034,
"grad_norm": 0.4856855869293213,
"learning_rate": 1.8003848616995333e-05,
"loss": 0.9226,
"step": 860
},
{
"epoch": 1.2125435540069687,
"grad_norm": 4.280817031860352,
"learning_rate": 1.795773544382915e-05,
"loss": 0.9301,
"step": 870
},
{
"epoch": 1.2264808362369337,
"grad_norm": 0.6370670199394226,
"learning_rate": 1.7911156190306296e-05,
"loss": 0.9843,
"step": 880
},
{
"epoch": 1.240418118466899,
"grad_norm": 0.7971873879432678,
"learning_rate": 1.786411358454902e-05,
"loss": 0.9352,
"step": 890
},
{
"epoch": 1.254355400696864,
"grad_norm": 0.6326978802680969,
"learning_rate": 1.7816610381817864e-05,
"loss": 0.8951,
"step": 900
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.8231450319290161,
"learning_rate": 1.776864936435029e-05,
"loss": 0.9498,
"step": 910
},
{
"epoch": 1.2822299651567945,
"grad_norm": 0.4994203746318817,
"learning_rate": 1.7720233341197726e-05,
"loss": 0.9127,
"step": 920
},
{
"epoch": 1.2961672473867596,
"grad_norm": 0.6145898103713989,
"learning_rate": 1.7671365148061053e-05,
"loss": 0.9249,
"step": 930
},
{
"epoch": 1.3101045296167246,
"grad_norm": 0.5441964864730835,
"learning_rate": 1.7622047647124488e-05,
"loss": 0.9078,
"step": 940
},
{
"epoch": 1.32404181184669,
"grad_norm": 0.5940006375312805,
"learning_rate": 1.757228372688799e-05,
"loss": 0.8937,
"step": 950
},
{
"epoch": 1.3379790940766552,
"grad_norm": 0.6185000538825989,
"learning_rate": 1.7522076301998048e-05,
"loss": 0.8922,
"step": 960
},
{
"epoch": 1.3519163763066202,
"grad_norm": 0.6179748773574829,
"learning_rate": 1.7471428313076984e-05,
"loss": 0.8864,
"step": 970
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.6203311681747437,
"learning_rate": 1.7420342726550728e-05,
"loss": 0.9071,
"step": 980
},
{
"epoch": 1.3797909407665505,
"grad_norm": 0.4726350009441376,
"learning_rate": 1.736882253447506e-05,
"loss": 0.9225,
"step": 990
},
{
"epoch": 1.3937282229965158,
"grad_norm": 0.5029098987579346,
"learning_rate": 1.73168707543604e-05,
"loss": 0.9036,
"step": 1000
},
{
"epoch": 1.4076655052264808,
"grad_norm": 0.5293656587600708,
"learning_rate": 1.726449042899502e-05,
"loss": 0.9093,
"step": 1010
},
{
"epoch": 1.4216027874564459,
"grad_norm": 0.5243374109268188,
"learning_rate": 1.7211684626266887e-05,
"loss": 0.8831,
"step": 1020
},
{
"epoch": 1.4355400696864111,
"grad_norm": 0.5120546221733093,
"learning_rate": 1.7158456438983934e-05,
"loss": 0.9138,
"step": 1030
},
{
"epoch": 1.4494773519163764,
"grad_norm": 0.4908638596534729,
"learning_rate": 1.7104808984692946e-05,
"loss": 0.9152,
"step": 1040
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.5942572951316833,
"learning_rate": 1.705074540549695e-05,
"loss": 0.9535,
"step": 1050
},
{
"epoch": 1.4773519163763067,
"grad_norm": 0.6341879367828369,
"learning_rate": 1.699626886787119e-05,
"loss": 0.9506,
"step": 1060
},
{
"epoch": 1.4912891986062717,
"grad_norm": 0.49885329604148865,
"learning_rate": 1.6941382562477664e-05,
"loss": 0.9313,
"step": 1070
},
{
"epoch": 1.505226480836237,
"grad_norm": 0.5285991430282593,
"learning_rate": 1.688608970397825e-05,
"loss": 0.9164,
"step": 1080
},
{
"epoch": 1.519163763066202,
"grad_norm": 0.5937536358833313,
"learning_rate": 1.683039353084644e-05,
"loss": 0.8853,
"step": 1090
},
{
"epoch": 1.533101045296167,
"grad_norm": 0.5034327507019043,
"learning_rate": 1.677429730517763e-05,
"loss": 0.9081,
"step": 1100
},
{
"epoch": 1.5470383275261324,
"grad_norm": 0.48388397693634033,
"learning_rate": 1.67178043124981e-05,
"loss": 0.8786,
"step": 1110
},
{
"epoch": 1.5609756097560976,
"grad_norm": 0.6228198409080505,
"learning_rate": 1.666091786157255e-05,
"loss": 0.8607,
"step": 1120
},
{
"epoch": 1.5749128919860627,
"grad_norm": 0.4986213147640228,
"learning_rate": 1.6603641284210335e-05,
"loss": 0.8904,
"step": 1130
},
{
"epoch": 1.588850174216028,
"grad_norm": 0.4710678458213806,
"learning_rate": 1.6545977935070293e-05,
"loss": 0.8807,
"step": 1140
},
{
"epoch": 1.6027874564459932,
"grad_norm": 0.5493403673171997,
"learning_rate": 1.6487931191464293e-05,
"loss": 0.9389,
"step": 1150
},
{
"epoch": 1.6167247386759582,
"grad_norm": 0.5593530535697937,
"learning_rate": 1.642950445315941e-05,
"loss": 0.9294,
"step": 1160
},
{
"epoch": 1.6306620209059233,
"grad_norm": 0.5576480031013489,
"learning_rate": 1.6370701142178815e-05,
"loss": 0.8685,
"step": 1170
},
{
"epoch": 1.6445993031358885,
"grad_norm": 0.5916953682899475,
"learning_rate": 1.6311524702601328e-05,
"loss": 0.8794,
"step": 1180
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.49112585186958313,
"learning_rate": 1.6251978600359727e-05,
"loss": 0.8893,
"step": 1190
},
{
"epoch": 1.6724738675958188,
"grad_norm": 0.606788694858551,
"learning_rate": 1.6192066323037723e-05,
"loss": 0.9162,
"step": 1200
},
{
"epoch": 1.6864111498257839,
"grad_norm": 0.5515270829200745,
"learning_rate": 1.613179137966572e-05,
"loss": 0.9027,
"step": 1210
},
{
"epoch": 1.7003484320557491,
"grad_norm": 0.51644366979599,
"learning_rate": 1.6071157300515274e-05,
"loss": 0.9218,
"step": 1220
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.48575639724731445,
"learning_rate": 1.6010167636892338e-05,
"loss": 0.9032,
"step": 1230
},
{
"epoch": 1.7282229965156795,
"grad_norm": 0.5278819799423218,
"learning_rate": 1.594882596092926e-05,
"loss": 0.9159,
"step": 1240
},
{
"epoch": 1.7421602787456445,
"grad_norm": 0.554883599281311,
"learning_rate": 1.5887135865375552e-05,
"loss": 0.9046,
"step": 1250
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.5662369728088379,
"learning_rate": 1.58251009633875e-05,
"loss": 0.8528,
"step": 1260
},
{
"epoch": 1.770034843205575,
"grad_norm": 0.6568381786346436,
"learning_rate": 1.57627248883165e-05,
"loss": 0.8885,
"step": 1270
},
{
"epoch": 1.78397212543554,
"grad_norm": 0.7277708649635315,
"learning_rate": 1.5700011293496285e-05,
"loss": 0.9159,
"step": 1280
},
{
"epoch": 1.797909407665505,
"grad_norm": 0.5788251161575317,
"learning_rate": 1.5636963852028936e-05,
"loss": 0.9036,
"step": 1290
},
{
"epoch": 1.8118466898954704,
"grad_norm": 0.5556735396385193,
"learning_rate": 1.557358625656976e-05,
"loss": 0.9155,
"step": 1300
},
{
"epoch": 1.8257839721254356,
"grad_norm": 0.4880397319793701,
"learning_rate": 1.550988221911101e-05,
"loss": 0.8849,
"step": 1310
},
{
"epoch": 1.8397212543554007,
"grad_norm": 0.6523249745368958,
"learning_rate": 1.5445855470764467e-05,
"loss": 0.8644,
"step": 1320
},
{
"epoch": 1.8536585365853657,
"grad_norm": 0.8619920015335083,
"learning_rate": 1.5381509761542925e-05,
"loss": 0.9073,
"step": 1330
},
{
"epoch": 1.867595818815331,
"grad_norm": 0.6176061034202576,
"learning_rate": 1.5316848860140545e-05,
"loss": 0.877,
"step": 1340
},
{
"epoch": 1.8815331010452963,
"grad_norm": 0.49368295073509216,
"learning_rate": 1.5251876553712129e-05,
"loss": 0.8854,
"step": 1350
},
{
"epoch": 1.8954703832752613,
"grad_norm": 0.6543199419975281,
"learning_rate": 1.5186596647651299e-05,
"loss": 0.883,
"step": 1360
},
{
"epoch": 1.9094076655052263,
"grad_norm": 0.568365216255188,
"learning_rate": 1.512101296536764e-05,
"loss": 0.9144,
"step": 1370
},
{
"epoch": 1.9233449477351916,
"grad_norm": 0.5592637062072754,
"learning_rate": 1.5055129348062733e-05,
"loss": 0.8869,
"step": 1380
},
{
"epoch": 1.9372822299651569,
"grad_norm": 0.642049252986908,
"learning_rate": 1.4988949654505212e-05,
"loss": 0.9268,
"step": 1390
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.8612108826637268,
"learning_rate": 1.492247776080472e-05,
"loss": 0.9231,
"step": 1400
},
{
"epoch": 1.965156794425087,
"grad_norm": 0.5690594911575317,
"learning_rate": 1.4855717560184925e-05,
"loss": 0.8862,
"step": 1410
},
{
"epoch": 1.9790940766550522,
"grad_norm": 0.5545530915260315,
"learning_rate": 1.4788672962755474e-05,
"loss": 0.8777,
"step": 1420
},
{
"epoch": 1.9930313588850175,
"grad_norm": 0.5686807036399841,
"learning_rate": 1.4721347895282977e-05,
"loss": 0.867,
"step": 1430
},
{
"epoch": 2.0069686411149825,
"grad_norm": 0.49507030844688416,
"learning_rate": 1.4653746300961037e-05,
"loss": 0.8879,
"step": 1440
},
{
"epoch": 2.0209059233449476,
"grad_norm": 0.5000828504562378,
"learning_rate": 1.4585872139179284e-05,
"loss": 0.8951,
"step": 1450
},
{
"epoch": 2.034843205574913,
"grad_norm": 0.5445813536643982,
"learning_rate": 1.4517729385291479e-05,
"loss": 0.8741,
"step": 1460
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.5599672198295593,
"learning_rate": 1.4449322030382681e-05,
"loss": 0.8956,
"step": 1470
},
{
"epoch": 2.062717770034843,
"grad_norm": 0.579526424407959,
"learning_rate": 1.4380654081035492e-05,
"loss": 0.8655,
"step": 1480
},
{
"epoch": 2.076655052264808,
"grad_norm": 0.5371329188346863,
"learning_rate": 1.4311729559095391e-05,
"loss": 0.8916,
"step": 1490
},
{
"epoch": 2.0905923344947737,
"grad_norm": 0.5372903943061829,
"learning_rate": 1.424255250143518e-05,
"loss": 0.9006,
"step": 1500
},
{
"epoch": 2.1045296167247387,
"grad_norm": 0.5461590886116028,
"learning_rate": 1.4173126959718542e-05,
"loss": 0.8981,
"step": 1510
},
{
"epoch": 2.1184668989547037,
"grad_norm": 0.5336124897003174,
"learning_rate": 1.410345700016274e-05,
"loss": 0.8979,
"step": 1520
},
{
"epoch": 2.132404181184669,
"grad_norm": 0.512737512588501,
"learning_rate": 1.4033546703300465e-05,
"loss": 0.8549,
"step": 1530
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.5914519429206848,
"learning_rate": 1.3963400163740828e-05,
"loss": 0.8807,
"step": 1540
},
{
"epoch": 2.1602787456445993,
"grad_norm": 0.6203148365020752,
"learning_rate": 1.3893021489929564e-05,
"loss": 0.9025,
"step": 1550
},
{
"epoch": 2.1742160278745644,
"grad_norm": 0.47906365990638733,
"learning_rate": 1.382241480390837e-05,
"loss": 0.9091,
"step": 1560
},
{
"epoch": 2.1881533101045294,
"grad_norm": 1.1542456150054932,
"learning_rate": 1.3751584241073517e-05,
"loss": 0.8571,
"step": 1570
},
{
"epoch": 2.202090592334495,
"grad_norm": 0.778533935546875,
"learning_rate": 1.3680533949933607e-05,
"loss": 0.8534,
"step": 1580
},
{
"epoch": 2.21602787456446,
"grad_norm": 0.5771265625953674,
"learning_rate": 1.3609268091866621e-05,
"loss": 0.8709,
"step": 1590
},
{
"epoch": 2.229965156794425,
"grad_norm": 0.5153730511665344,
"learning_rate": 1.3537790840876179e-05,
"loss": 0.8865,
"step": 1600
},
{
"epoch": 2.2439024390243905,
"grad_norm": 0.5823934674263,
"learning_rate": 1.346610638334707e-05,
"loss": 0.8608,
"step": 1610
},
{
"epoch": 2.2578397212543555,
"grad_norm": 0.4887414872646332,
"learning_rate": 1.3394218917800064e-05,
"loss": 0.8661,
"step": 1620
},
{
"epoch": 2.2717770034843205,
"grad_norm": 0.5397761464118958,
"learning_rate": 1.3322132654646003e-05,
"loss": 0.8719,
"step": 1630
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.7656607627868652,
"learning_rate": 1.3249851815939197e-05,
"loss": 0.8857,
"step": 1640
},
{
"epoch": 2.2996515679442506,
"grad_norm": 0.5524553060531616,
"learning_rate": 1.3177380635130144e-05,
"loss": 0.8957,
"step": 1650
},
{
"epoch": 2.313588850174216,
"grad_norm": 0.7648917436599731,
"learning_rate": 1.3104723356817582e-05,
"loss": 0.8746,
"step": 1660
},
{
"epoch": 2.327526132404181,
"grad_norm": 0.696306049823761,
"learning_rate": 1.3031884236499877e-05,
"loss": 0.8732,
"step": 1670
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.5518249273300171,
"learning_rate": 1.2958867540325785e-05,
"loss": 0.8641,
"step": 1680
},
{
"epoch": 2.3554006968641117,
"grad_norm": 0.5839936137199402,
"learning_rate": 1.2885677544844592e-05,
"loss": 0.8317,
"step": 1690
},
{
"epoch": 2.3693379790940767,
"grad_norm": 0.5415021777153015,
"learning_rate": 1.2812318536755624e-05,
"loss": 0.8815,
"step": 1700
},
{
"epoch": 2.3832752613240418,
"grad_norm": 0.5816763639450073,
"learning_rate": 1.2738794812657194e-05,
"loss": 0.8682,
"step": 1710
},
{
"epoch": 2.397212543554007,
"grad_norm": 0.5739949941635132,
"learning_rate": 1.266511067879494e-05,
"loss": 0.8928,
"step": 1720
},
{
"epoch": 2.4111498257839723,
"grad_norm": 0.5285424590110779,
"learning_rate": 1.2591270450809612e-05,
"loss": 0.9042,
"step": 1730
},
{
"epoch": 2.4250871080139373,
"grad_norm": 0.67451012134552,
"learning_rate": 1.251727845348432e-05,
"loss": 0.9084,
"step": 1740
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.6238117218017578,
"learning_rate": 1.2443139020491216e-05,
"loss": 0.8828,
"step": 1750
},
{
"epoch": 2.4529616724738674,
"grad_norm": 0.527727484703064,
"learning_rate": 1.236885649413768e-05,
"loss": 0.8348,
"step": 1760
},
{
"epoch": 2.466898954703833,
"grad_norm": 0.6208236813545227,
"learning_rate": 1.2294435225112005e-05,
"loss": 0.8976,
"step": 1770
},
{
"epoch": 2.480836236933798,
"grad_norm": 0.6415792107582092,
"learning_rate": 1.2219879572228555e-05,
"loss": 0.853,
"step": 1780
},
{
"epoch": 2.494773519163763,
"grad_norm": 0.5672902464866638,
"learning_rate": 1.2145193902172496e-05,
"loss": 0.8624,
"step": 1790
},
{
"epoch": 2.508710801393728,
"grad_norm": 0.5251675248146057,
"learning_rate": 1.2070382589244026e-05,
"loss": 0.8919,
"step": 1800
},
{
"epoch": 2.5226480836236935,
"grad_norm": 0.6049728989601135,
"learning_rate": 1.199545001510218e-05,
"loss": 0.8417,
"step": 1810
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.5997565984725952,
"learning_rate": 1.1920400568508201e-05,
"loss": 0.8831,
"step": 1820
},
{
"epoch": 2.5505226480836236,
"grad_norm": 0.5272901058197021,
"learning_rate": 1.184523864506849e-05,
"loss": 0.8773,
"step": 1830
},
{
"epoch": 2.564459930313589,
"grad_norm": 0.567862331867218,
"learning_rate": 1.1769968646977148e-05,
"loss": 0.8595,
"step": 1840
},
{
"epoch": 2.578397212543554,
"grad_norm": 0.5373286008834839,
"learning_rate": 1.1694594982758164e-05,
"loss": 0.8896,
"step": 1850
},
{
"epoch": 2.592334494773519,
"grad_norm": 0.5112028121948242,
"learning_rate": 1.161912206700719e-05,
"loss": 0.8882,
"step": 1860
},
{
"epoch": 2.6062717770034842,
"grad_norm": 0.4764540493488312,
"learning_rate": 1.154355432013299e-05,
"loss": 0.8381,
"step": 1870
},
{
"epoch": 2.6202090592334493,
"grad_norm": 0.7286739349365234,
"learning_rate": 1.1467896168098533e-05,
"loss": 0.8502,
"step": 1880
},
{
"epoch": 2.6341463414634148,
"grad_norm": 0.5751617550849915,
"learning_rate": 1.1392152042161774e-05,
"loss": 0.8631,
"step": 1890
},
{
"epoch": 2.64808362369338,
"grad_norm": 0.5550952553749084,
"learning_rate": 1.1316326378616121e-05,
"loss": 0.9055,
"step": 1900
},
{
"epoch": 2.662020905923345,
"grad_norm": 0.5390698909759521,
"learning_rate": 1.1240423618530578e-05,
"loss": 0.8586,
"step": 1910
},
{
"epoch": 2.6759581881533103,
"grad_norm": 0.5401940941810608,
"learning_rate": 1.1164448207489673e-05,
"loss": 0.873,
"step": 1920
},
{
"epoch": 2.6898954703832754,
"grad_norm": 0.7127025723457336,
"learning_rate": 1.1088404595333046e-05,
"loss": 0.8753,
"step": 1930
},
{
"epoch": 2.7038327526132404,
"grad_norm": 0.6411701440811157,
"learning_rate": 1.101229723589485e-05,
"loss": 0.8814,
"step": 1940
},
{
"epoch": 2.7177700348432055,
"grad_norm": 0.5122844576835632,
"learning_rate": 1.0936130586742881e-05,
"loss": 0.8509,
"step": 1950
},
{
"epoch": 2.7317073170731705,
"grad_norm": 2.784543514251709,
"learning_rate": 1.0859909108917497e-05,
"loss": 0.8112,
"step": 1960
},
{
"epoch": 2.745644599303136,
"grad_norm": 0.533532977104187,
"learning_rate": 1.0783637266670348e-05,
"loss": 0.8479,
"step": 1970
},
{
"epoch": 2.759581881533101,
"grad_norm": 0.5365408062934875,
"learning_rate": 1.0707319527202902e-05,
"loss": 0.8281,
"step": 1980
},
{
"epoch": 2.773519163763066,
"grad_norm": 0.45295801758766174,
"learning_rate": 1.0630960360404793e-05,
"loss": 0.9046,
"step": 1990
},
{
"epoch": 2.7874564459930316,
"grad_norm": 0.656039834022522,
"learning_rate": 1.0554564238592051e-05,
"loss": 0.8305,
"step": 2000
},
{
"epoch": 2.8013937282229966,
"grad_norm": 0.5675934553146362,
"learning_rate": 1.0478135636245122e-05,
"loss": 0.8633,
"step": 2010
},
{
"epoch": 2.8153310104529616,
"grad_norm": 0.5480667948722839,
"learning_rate": 1.0401679029746828e-05,
"loss": 0.8756,
"step": 2020
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.5900964736938477,
"learning_rate": 1.0325198897120183e-05,
"loss": 0.8737,
"step": 2030
},
{
"epoch": 2.8432055749128917,
"grad_norm": 0.688490092754364,
"learning_rate": 1.0248699717766107e-05,
"loss": 0.8425,
"step": 2040
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.5785161256790161,
"learning_rate": 1.0172185972201082e-05,
"loss": 0.902,
"step": 2050
},
{
"epoch": 2.8710801393728222,
"grad_norm": 0.5259153246879578,
"learning_rate": 1.0095662141794725e-05,
"loss": 0.8793,
"step": 2060
},
{
"epoch": 2.8850174216027873,
"grad_norm": 0.5888857841491699,
"learning_rate": 1.0019132708507307e-05,
"loss": 0.8665,
"step": 2070
},
{
"epoch": 2.8989547038327528,
"grad_norm": 0.6237362027168274,
"learning_rate": 9.94260215462727e-06,
"loss": 0.8647,
"step": 2080
},
{
"epoch": 2.912891986062718,
"grad_norm": 0.5640315413475037,
"learning_rate": 9.866074962508684e-06,
"loss": 0.8659,
"step": 2090
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.4334649443626404,
"learning_rate": 9.789555614308721e-06,
"loss": 0.8566,
"step": 2100
},
{
"epoch": 2.940766550522648,
"grad_norm": 0.5068169832229614,
"learning_rate": 9.713048591725138e-06,
"loss": 0.8712,
"step": 2110
},
{
"epoch": 2.9547038327526134,
"grad_norm": 0.5684682726860046,
"learning_rate": 9.63655837573379e-06,
"loss": 0.8217,
"step": 2120
},
{
"epoch": 2.9686411149825784,
"grad_norm": 0.6925583481788635,
"learning_rate": 9.560089446326175e-06,
"loss": 0.8675,
"step": 2130
},
{
"epoch": 2.9825783972125435,
"grad_norm": 0.7573685050010681,
"learning_rate": 9.483646282247056e-06,
"loss": 0.8369,
"step": 2140
},
{
"epoch": 2.996515679442509,
"grad_norm": 0.598778486251831,
"learning_rate": 9.407233360732119e-06,
"loss": 0.8434,
"step": 2150
},
{
"epoch": 3.010452961672474,
"grad_norm": 114.71971893310547,
"learning_rate": 9.330855157245776e-06,
"loss": 0.8841,
"step": 2160
},
{
"epoch": 3.024390243902439,
"grad_norm": 0.5424976348876953,
"learning_rate": 9.254516145219006e-06,
"loss": 0.8653,
"step": 2170
},
{
"epoch": 3.038327526132404,
"grad_norm": 0.48183199763298035,
"learning_rate": 9.17822079578738e-06,
"loss": 0.8402,
"step": 2180
},
{
"epoch": 3.052264808362369,
"grad_norm": 0.5667704343795776,
"learning_rate": 9.101973577529164e-06,
"loss": 0.8357,
"step": 2190
},
{
"epoch": 3.0662020905923346,
"grad_norm": 0.5843963027000427,
"learning_rate": 9.025778956203611e-06,
"loss": 0.8538,
"step": 2200
},
{
"epoch": 3.0801393728222997,
"grad_norm": 0.5097166895866394,
"learning_rate": 8.949641394489399e-06,
"loss": 0.8208,
"step": 2210
},
{
"epoch": 3.0940766550522647,
"grad_norm": 0.5178412795066833,
"learning_rate": 8.873565351723249e-06,
"loss": 0.9026,
"step": 2220
},
{
"epoch": 3.10801393728223,
"grad_norm": 0.6717800498008728,
"learning_rate": 8.79755528363876e-06,
"loss": 0.8002,
"step": 2230
},
{
"epoch": 3.1219512195121952,
"grad_norm": 0.81369549036026,
"learning_rate": 8.721615642105417e-06,
"loss": 0.8757,
"step": 2240
},
{
"epoch": 3.1358885017421603,
"grad_norm": 0.574155867099762,
"learning_rate": 8.645750874867876e-06,
"loss": 0.8411,
"step": 2250
},
{
"epoch": 3.1498257839721253,
"grad_norm": 0.4896714985370636,
"learning_rate": 8.56996542528542e-06,
"loss": 0.8671,
"step": 2260
},
{
"epoch": 3.1637630662020904,
"grad_norm": 0.5032427906990051,
"learning_rate": 8.494263732071772e-06,
"loss": 0.8521,
"step": 2270
},
{
"epoch": 3.177700348432056,
"grad_norm": 0.5645169615745544,
"learning_rate": 8.418650229035054e-06,
"loss": 0.8407,
"step": 2280
},
{
"epoch": 3.191637630662021,
"grad_norm": 0.5049313306808472,
"learning_rate": 8.343129344818162e-06,
"loss": 0.853,
"step": 2290
},
{
"epoch": 3.205574912891986,
"grad_norm": 0.5244989991188049,
"learning_rate": 8.267705502639342e-06,
"loss": 0.8546,
"step": 2300
},
{
"epoch": 3.2195121951219514,
"grad_norm": 0.6323722004890442,
"learning_rate": 8.192383120033147e-06,
"loss": 0.8408,
"step": 2310
},
{
"epoch": 3.2334494773519165,
"grad_norm": 0.5894546508789062,
"learning_rate": 8.117166608591693e-06,
"loss": 0.865,
"step": 2320
},
{
"epoch": 3.2473867595818815,
"grad_norm": 1.284786343574524,
"learning_rate": 8.042060373706275e-06,
"loss": 0.8596,
"step": 2330
},
{
"epoch": 3.2613240418118465,
"grad_norm": 0.5084718465805054,
"learning_rate": 7.967068814309359e-06,
"loss": 0.8377,
"step": 2340
},
{
"epoch": 3.275261324041812,
"grad_norm": 0.5845734477043152,
"learning_rate": 7.892196322616912e-06,
"loss": 0.8597,
"step": 2350
},
{
"epoch": 3.289198606271777,
"grad_norm": 0.5465214252471924,
"learning_rate": 7.817447283871187e-06,
"loss": 0.8584,
"step": 2360
},
{
"epoch": 3.303135888501742,
"grad_norm": 0.5865809917449951,
"learning_rate": 7.742826076083848e-06,
"loss": 0.843,
"step": 2370
},
{
"epoch": 3.317073170731707,
"grad_norm": 0.455839604139328,
"learning_rate": 7.668337069779577e-06,
"loss": 0.8599,
"step": 2380
},
{
"epoch": 3.3310104529616726,
"grad_norm": 0.48517608642578125,
"learning_rate": 7.593984627740075e-06,
"loss": 0.8592,
"step": 2390
},
{
"epoch": 3.3449477351916377,
"grad_norm": 0.5980703830718994,
"learning_rate": 7.519773104748562e-06,
"loss": 0.8673,
"step": 2400
},
{
"epoch": 3.3588850174216027,
"grad_norm": 0.5855985879898071,
"learning_rate": 7.4457068473346836e-06,
"loss": 0.8518,
"step": 2410
},
{
"epoch": 3.3728222996515678,
"grad_norm": 0.4601869583129883,
"learning_rate": 7.371790193519979e-06,
"loss": 0.8457,
"step": 2420
},
{
"epoch": 3.3867595818815333,
"grad_norm": 0.5327535271644592,
"learning_rate": 7.298027472563768e-06,
"loss": 0.8373,
"step": 2430
},
{
"epoch": 3.4006968641114983,
"grad_norm": 0.5465139746665955,
"learning_rate": 7.224423004709607e-06,
"loss": 0.8515,
"step": 2440
},
{
"epoch": 3.4146341463414633,
"grad_norm": 0.5567154884338379,
"learning_rate": 7.1509811009322574e-06,
"loss": 0.8541,
"step": 2450
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.5773440599441528,
"learning_rate": 7.077706062685181e-06,
"loss": 0.849,
"step": 2460
},
{
"epoch": 3.442508710801394,
"grad_norm": 0.5186436176300049,
"learning_rate": 7.004602181648626e-06,
"loss": 0.8857,
"step": 2470
},
{
"epoch": 3.456445993031359,
"grad_norm": 0.647905707359314,
"learning_rate": 6.931673739478235e-06,
"loss": 0.8486,
"step": 2480
},
{
"epoch": 3.470383275261324,
"grad_norm": 0.7109155058860779,
"learning_rate": 6.858925007554308e-06,
"loss": 0.8703,
"step": 2490
},
{
"epoch": 3.484320557491289,
"grad_norm": 0.5457651019096375,
"learning_rate": 6.786360246731595e-06,
"loss": 0.8494,
"step": 2500
},
{
"epoch": 3.4982578397212545,
"grad_norm": 0.49579504132270813,
"learning_rate": 6.713983707089773e-06,
"loss": 0.848,
"step": 2510
},
{
"epoch": 3.5121951219512195,
"grad_norm": 0.472918838262558,
"learning_rate": 6.641799627684481e-06,
"loss": 0.8633,
"step": 2520
},
{
"epoch": 3.5261324041811846,
"grad_norm": 0.6405051946640015,
"learning_rate": 6.569812236299089e-06,
"loss": 0.8672,
"step": 2530
},
{
"epoch": 3.54006968641115,
"grad_norm": 0.5040938258171082,
"learning_rate": 6.498025749197036e-06,
"loss": 0.847,
"step": 2540
},
{
"epoch": 3.554006968641115,
"grad_norm": 0.5406576991081238,
"learning_rate": 6.426444370874906e-06,
"loss": 0.8291,
"step": 2550
},
{
"epoch": 3.56794425087108,
"grad_norm": 0.47771602869033813,
"learning_rate": 6.355072293816178e-06,
"loss": 0.8522,
"step": 2560
},
{
"epoch": 3.581881533101045,
"grad_norm": 0.5669821500778198,
"learning_rate": 6.283913698245659e-06,
"loss": 0.8316,
"step": 2570
},
{
"epoch": 3.59581881533101,
"grad_norm": 0.6127913594245911,
"learning_rate": 6.212972751884663e-06,
"loss": 0.8686,
"step": 2580
},
{
"epoch": 3.6097560975609757,
"grad_norm": 0.5714460015296936,
"learning_rate": 6.142253609706898e-06,
"loss": 0.8493,
"step": 2590
},
{
"epoch": 3.6236933797909407,
"grad_norm": 0.5862120389938354,
"learning_rate": 6.0717604136951315e-06,
"loss": 0.8962,
"step": 2600
},
{
"epoch": 3.637630662020906,
"grad_norm": 0.5323344469070435,
"learning_rate": 6.001497292598566e-06,
"loss": 0.8615,
"step": 2610
},
{
"epoch": 3.6515679442508713,
"grad_norm": 0.5613053441047668,
"learning_rate": 5.931468361691053e-06,
"loss": 0.8823,
"step": 2620
},
{
"epoch": 3.6655052264808363,
"grad_norm": 0.512625515460968,
"learning_rate": 5.861677722530037e-06,
"loss": 0.8505,
"step": 2630
},
{
"epoch": 3.6794425087108014,
"grad_norm": 0.5254554152488708,
"learning_rate": 5.792129462716355e-06,
"loss": 0.8456,
"step": 2640
},
{
"epoch": 3.6933797909407664,
"grad_norm": 0.5453774929046631,
"learning_rate": 5.722827655654801e-06,
"loss": 0.862,
"step": 2650
},
{
"epoch": 3.7073170731707314,
"grad_norm": 0.5125150680541992,
"learning_rate": 5.653776360315562e-06,
"loss": 0.8497,
"step": 2660
},
{
"epoch": 3.721254355400697,
"grad_norm": 0.5420352816581726,
"learning_rate": 5.584979620996491e-06,
"loss": 0.8507,
"step": 2670
},
{
"epoch": 3.735191637630662,
"grad_norm": 0.6987497210502625,
"learning_rate": 5.516441467086231e-06,
"loss": 0.8596,
"step": 2680
},
{
"epoch": 3.749128919860627,
"grad_norm": 0.6294048428535461,
"learning_rate": 5.448165912828214e-06,
"loss": 0.8402,
"step": 2690
},
{
"epoch": 3.7630662020905925,
"grad_norm": 0.4914676547050476,
"learning_rate": 5.380156957085536e-06,
"loss": 0.8544,
"step": 2700
},
{
"epoch": 3.7770034843205575,
"grad_norm": 0.46825504302978516,
"learning_rate": 5.312418583106784e-06,
"loss": 0.8307,
"step": 2710
},
{
"epoch": 3.7909407665505226,
"grad_norm": 0.6263434290885925,
"learning_rate": 5.244954758292691e-06,
"loss": 0.8472,
"step": 2720
},
{
"epoch": 3.8048780487804876,
"grad_norm": 0.6025940179824829,
"learning_rate": 5.177769433963801e-06,
"loss": 0.8388,
"step": 2730
},
{
"epoch": 3.818815331010453,
"grad_norm": 0.6129311919212341,
"learning_rate": 5.110866545129031e-06,
"loss": 0.8647,
"step": 2740
},
{
"epoch": 3.832752613240418,
"grad_norm": 0.5434401035308838,
"learning_rate": 5.044250010255202e-06,
"loss": 0.8224,
"step": 2750
},
{
"epoch": 3.846689895470383,
"grad_norm": 0.5361849069595337,
"learning_rate": 4.97792373103753e-06,
"loss": 0.8677,
"step": 2760
},
{
"epoch": 3.8606271777003487,
"grad_norm": 0.5294119715690613,
"learning_rate": 4.911891592171113e-06,
"loss": 0.8471,
"step": 2770
},
{
"epoch": 3.8745644599303137,
"grad_norm": 0.3936956524848938,
"learning_rate": 4.846157461123411e-06,
"loss": 0.8718,
"step": 2780
},
{
"epoch": 3.8885017421602788,
"grad_norm": 0.5664070844650269,
"learning_rate": 4.780725187907707e-06,
"loss": 0.8424,
"step": 2790
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.6031394600868225,
"learning_rate": 4.715598604857648e-06,
"loss": 0.8469,
"step": 2800
},
{
"epoch": 3.916376306620209,
"grad_norm": 0.5560880899429321,
"learning_rate": 4.65078152640276e-06,
"loss": 0.8626,
"step": 2810
},
{
"epoch": 3.9303135888501743,
"grad_norm": 0.5566712021827698,
"learning_rate": 4.586277748845055e-06,
"loss": 0.9251,
"step": 2820
},
{
"epoch": 3.9442508710801394,
"grad_norm": 0.5782540440559387,
"learning_rate": 4.5220910501366635e-06,
"loss": 0.8321,
"step": 2830
},
{
"epoch": 3.9581881533101044,
"grad_norm": 0.5216631889343262,
"learning_rate": 4.458225189658598e-06,
"loss": 0.8293,
"step": 2840
},
{
"epoch": 3.97212543554007,
"grad_norm": 0.511968195438385,
"learning_rate": 4.3946839080005236e-06,
"loss": 0.8238,
"step": 2850
},
{
"epoch": 3.986062717770035,
"grad_norm": 0.49393701553344727,
"learning_rate": 4.331470926741707e-06,
"loss": 0.885,
"step": 2860
},
{
"epoch": 4.0,
"grad_norm": 0.4580581486225128,
"learning_rate": 4.268589948233034e-06,
"loss": 0.8487,
"step": 2870
},
{
"epoch": 4.013937282229965,
"grad_norm": 0.4895112216472626,
"learning_rate": 4.2060446553801585e-06,
"loss": 0.8222,
"step": 2880
},
{
"epoch": 4.02787456445993,
"grad_norm": 0.5034216046333313,
"learning_rate": 4.143838711427808e-06,
"loss": 0.8645,
"step": 2890
},
{
"epoch": 4.041811846689895,
"grad_norm": 0.5985310673713684,
"learning_rate": 4.0819757597452246e-06,
"loss": 0.8652,
"step": 2900
},
{
"epoch": 4.055749128919861,
"grad_norm": 0.4828563332557678,
"learning_rate": 4.020459423612777e-06,
"loss": 0.8602,
"step": 2910
},
{
"epoch": 4.069686411149826,
"grad_norm": 0.48318225145339966,
"learning_rate": 3.959293306009734e-06,
"loss": 0.8541,
"step": 2920
},
{
"epoch": 4.083623693379791,
"grad_norm": 0.5043054223060608,
"learning_rate": 3.89848098940326e-06,
"loss": 0.857,
"step": 2930
},
{
"epoch": 4.097560975609756,
"grad_norm": 0.5322110652923584,
"learning_rate": 3.838026035538581e-06,
"loss": 0.8419,
"step": 2940
},
{
"epoch": 4.111498257839721,
"grad_norm": 0.7655691504478455,
"learning_rate": 3.7779319852303766e-06,
"loss": 0.8551,
"step": 2950
},
{
"epoch": 4.125435540069686,
"grad_norm": 0.5095774531364441,
"learning_rate": 3.718202358155384e-06,
"loss": 0.838,
"step": 2960
},
{
"epoch": 4.139372822299651,
"grad_norm": 0.5320903658866882,
"learning_rate": 3.658840652646287e-06,
"loss": 0.8044,
"step": 2970
},
{
"epoch": 4.153310104529616,
"grad_norm": 0.5706301927566528,
"learning_rate": 3.5998503454867807e-06,
"loss": 0.858,
"step": 2980
},
{
"epoch": 4.167247386759582,
"grad_norm": 1.014062523841858,
"learning_rate": 3.5412348917079507e-06,
"loss": 0.8739,
"step": 2990
},
{
"epoch": 4.181184668989547,
"grad_norm": 0.5381621718406677,
"learning_rate": 3.4829977243859414e-06,
"loss": 0.8082,
"step": 3000
},
{
"epoch": 4.195121951219512,
"grad_norm": 0.48279663920402527,
"learning_rate": 3.425142254440835e-06,
"loss": 0.8335,
"step": 3010
},
{
"epoch": 4.209059233449477,
"grad_norm": 0.6682676076889038,
"learning_rate": 3.367671870436915e-06,
"loss": 0.8484,
"step": 3020
},
{
"epoch": 4.2229965156794425,
"grad_norm": 0.5164760947227478,
"learning_rate": 3.310589938384179e-06,
"loss": 0.8228,
"step": 3030
},
{
"epoch": 4.2369337979094075,
"grad_norm": 0.5330147743225098,
"learning_rate": 3.253899801541206e-06,
"loss": 0.8475,
"step": 3040
},
{
"epoch": 4.2508710801393725,
"grad_norm": 0.5183762907981873,
"learning_rate": 3.197604780219323e-06,
"loss": 0.8228,
"step": 3050
},
{
"epoch": 4.264808362369338,
"grad_norm": 0.5862233638763428,
"learning_rate": 3.1417081715881623e-06,
"loss": 0.8419,
"step": 3060
},
{
"epoch": 4.2787456445993035,
"grad_norm": 0.5445356369018555,
"learning_rate": 3.0862132494825325e-06,
"loss": 0.875,
"step": 3070
},
{
"epoch": 4.2926829268292686,
"grad_norm": 0.5905585885047913,
"learning_rate": 3.0311232642106768e-06,
"loss": 0.8548,
"step": 3080
},
{
"epoch": 4.306620209059234,
"grad_norm": 0.5876056551933289,
"learning_rate": 2.976441442363893e-06,
"loss": 0.8812,
"step": 3090
},
{
"epoch": 4.320557491289199,
"grad_norm": 0.5916198492050171,
"learning_rate": 2.922170986627573e-06,
"loss": 0.8289,
"step": 3100
},
{
"epoch": 4.334494773519164,
"grad_norm": 0.555949866771698,
"learning_rate": 2.8683150755936107e-06,
"loss": 0.8822,
"step": 3110
},
{
"epoch": 4.348432055749129,
"grad_norm": 0.5823965668678284,
"learning_rate": 2.8148768635742286e-06,
"loss": 0.8308,
"step": 3120
},
{
"epoch": 4.362369337979094,
"grad_norm": 0.6472144722938538,
"learning_rate": 2.761859480417255e-06,
"loss": 0.8368,
"step": 3130
},
{
"epoch": 4.376306620209059,
"grad_norm": 0.5416210293769836,
"learning_rate": 2.7092660313227748e-06,
"loss": 0.8655,
"step": 3140
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.5069013833999634,
"learning_rate": 2.6570995966612945e-06,
"loss": 0.8657,
"step": 3150
},
{
"epoch": 4.40418118466899,
"grad_norm": 0.4444841146469116,
"learning_rate": 2.605363231793302e-06,
"loss": 0.8362,
"step": 3160
},
{
"epoch": 4.418118466898955,
"grad_norm": 0.5552143454551697,
"learning_rate": 2.554059966890332e-06,
"loss": 0.8027,
"step": 3170
},
{
"epoch": 4.43205574912892,
"grad_norm": 0.7829400897026062,
"learning_rate": 2.503192806757474e-06,
"loss": 0.8351,
"step": 3180
},
{
"epoch": 4.445993031358885,
"grad_norm": 0.477295458316803,
"learning_rate": 2.4527647306574e-06,
"loss": 0.8102,
"step": 3190
},
{
"epoch": 4.45993031358885,
"grad_norm": 0.5426877737045288,
"learning_rate": 2.402778692135861e-06,
"loss": 0.8406,
"step": 3200
},
{
"epoch": 4.473867595818815,
"grad_norm": 0.5670856237411499,
"learning_rate": 2.353237618848695e-06,
"loss": 0.8258,
"step": 3210
},
{
"epoch": 4.487804878048781,
"grad_norm": 0.4924805462360382,
"learning_rate": 2.304144412390367e-06,
"loss": 0.8303,
"step": 3220
},
{
"epoch": 4.501742160278746,
"grad_norm": 0.5142589807510376,
"learning_rate": 2.255501948124017e-06,
"loss": 0.8714,
"step": 3230
},
{
"epoch": 4.515679442508711,
"grad_norm": 0.546807587146759,
"learning_rate": 2.207313075013059e-06,
"loss": 0.8221,
"step": 3240
},
{
"epoch": 4.529616724738676,
"grad_norm": 0.5798578858375549,
"learning_rate": 2.1595806154542965e-06,
"loss": 0.8625,
"step": 3250
},
{
"epoch": 4.543554006968641,
"grad_norm": 0.567733108997345,
"learning_rate": 2.112307365112657e-06,
"loss": 0.8242,
"step": 3260
},
{
"epoch": 4.557491289198606,
"grad_norm": 0.5294705033302307,
"learning_rate": 2.065496092757403e-06,
"loss": 0.8721,
"step": 3270
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.49266737699508667,
"learning_rate": 2.019149540100005e-06,
"loss": 0.8576,
"step": 3280
},
{
"epoch": 4.585365853658536,
"grad_norm": 0.5400304198265076,
"learning_rate": 1.973270421633543e-06,
"loss": 0.8899,
"step": 3290
},
{
"epoch": 4.599303135888501,
"grad_norm": 0.5392309427261353,
"learning_rate": 1.927861424473726e-06,
"loss": 0.8227,
"step": 3300
},
{
"epoch": 4.613240418118467,
"grad_norm": 0.5385839343070984,
"learning_rate": 1.882925208201498e-06,
"loss": 0.838,
"step": 3310
},
{
"epoch": 4.627177700348432,
"grad_norm": 0.5418098568916321,
"learning_rate": 1.8384644047072864e-06,
"loss": 0.8612,
"step": 3320
},
{
"epoch": 4.641114982578397,
"grad_norm": 0.5207215547561646,
"learning_rate": 1.7944816180368408e-06,
"loss": 0.8356,
"step": 3330
},
{
"epoch": 4.655052264808362,
"grad_norm": 0.9076542854309082,
"learning_rate": 1.7509794242387135e-06,
"loss": 0.8224,
"step": 3340
},
{
"epoch": 4.668989547038327,
"grad_norm": 0.4973151385784149,
"learning_rate": 1.7079603712133908e-06,
"loss": 0.8678,
"step": 3350
},
{
"epoch": 4.682926829268292,
"grad_norm": 0.5431068539619446,
"learning_rate": 1.6654269785640608e-06,
"loss": 0.8467,
"step": 3360
},
{
"epoch": 4.696864111498257,
"grad_norm": 0.4680764079093933,
"learning_rate": 1.623381737449038e-06,
"loss": 0.8364,
"step": 3370
},
{
"epoch": 4.710801393728223,
"grad_norm": 0.507692813873291,
"learning_rate": 1.5818271104358574e-06,
"loss": 0.8854,
"step": 3380
},
{
"epoch": 4.724738675958188,
"grad_norm": 0.758124828338623,
"learning_rate": 1.5407655313570525e-06,
"loss": 0.8534,
"step": 3390
},
{
"epoch": 4.7386759581881535,
"grad_norm": 0.6623209118843079,
"learning_rate": 1.5001994051675894e-06,
"loss": 0.8814,
"step": 3400
},
{
"epoch": 4.7526132404181185,
"grad_norm": 0.5073679089546204,
"learning_rate": 1.4601311078040304e-06,
"loss": 0.8457,
"step": 3410
},
{
"epoch": 4.7665505226480835,
"grad_norm": 0.500339150428772,
"learning_rate": 1.4205629860453641e-06,
"loss": 0.842,
"step": 3420
},
{
"epoch": 4.780487804878049,
"grad_norm": 0.8517163395881653,
"learning_rate": 1.3814973573755518e-06,
"loss": 0.8982,
"step": 3430
},
{
"epoch": 4.794425087108014,
"grad_norm": 0.5409610867500305,
"learning_rate": 1.3429365098478087e-06,
"loss": 0.8492,
"step": 3440
},
{
"epoch": 4.80836236933798,
"grad_norm": 0.5923708081245422,
"learning_rate": 1.3048827019505828e-06,
"loss": 0.8548,
"step": 3450
},
{
"epoch": 4.822299651567945,
"grad_norm": 0.4563385844230652,
"learning_rate": 1.2673381624752813e-06,
"loss": 0.8518,
"step": 3460
},
{
"epoch": 4.83623693379791,
"grad_norm": 0.89119553565979,
"learning_rate": 1.2303050903857195e-06,
"loss": 0.8355,
"step": 3470
},
{
"epoch": 4.850174216027875,
"grad_norm": 0.5999810695648193,
"learning_rate": 1.1937856546893533e-06,
"loss": 0.8347,
"step": 3480
},
{
"epoch": 4.86411149825784,
"grad_norm": 0.6495084166526794,
"learning_rate": 1.1577819943102132e-06,
"loss": 0.7981,
"step": 3490
},
{
"epoch": 4.878048780487805,
"grad_norm": 0.5115063190460205,
"learning_rate": 1.122296217963651e-06,
"loss": 0.8321,
"step": 3500
},
{
"epoch": 4.89198606271777,
"grad_norm": 0.5369818806648254,
"learning_rate": 1.0873304040328193e-06,
"loss": 0.8499,
"step": 3510
},
{
"epoch": 4.905923344947735,
"grad_norm": 0.523055374622345,
"learning_rate": 1.052886600446954e-06,
"loss": 0.8561,
"step": 3520
},
{
"epoch": 4.9198606271777,
"grad_norm": 0.6084771156311035,
"learning_rate": 1.0189668245614092e-06,
"loss": 0.854,
"step": 3530
},
{
"epoch": 4.933797909407666,
"grad_norm": 0.4877120852470398,
"learning_rate": 9.855730630395244e-07,
"loss": 0.8161,
"step": 3540
},
{
"epoch": 4.947735191637631,
"grad_norm": 0.6254660487174988,
"learning_rate": 9.52707271736254e-07,
"loss": 0.8705,
"step": 3550
},
{
"epoch": 4.961672473867596,
"grad_norm": 0.6138275861740112,
"learning_rate": 9.203713755836108e-07,
"loss": 0.8599,
"step": 3560
},
{
"epoch": 4.975609756097561,
"grad_norm": 0.5839621424674988,
"learning_rate": 8.885672684779345e-07,
"loss": 0.8536,
"step": 3570
},
{
"epoch": 4.989547038327526,
"grad_norm": 0.544059157371521,
"learning_rate": 8.572968131689585e-07,
"loss": 0.8536,
"step": 3580
},
{
"epoch": 5.003484320557491,
"grad_norm": 0.5101696848869324,
"learning_rate": 8.265618411507148e-07,
"loss": 0.84,
"step": 3590
},
{
"epoch": 5.017421602787456,
"grad_norm": 0.5106877684593201,
"learning_rate": 7.963641525542564e-07,
"loss": 0.8752,
"step": 3600
},
{
"epoch": 5.031358885017422,
"grad_norm": 0.5784628391265869,
"learning_rate": 7.667055160422432e-07,
"loss": 0.8417,
"step": 3610
},
{
"epoch": 5.045296167247387,
"grad_norm": 0.5501140356063843,
"learning_rate": 7.375876687053252e-07,
"loss": 0.8473,
"step": 3620
},
{
"epoch": 5.059233449477352,
"grad_norm": 0.4966902434825897,
"learning_rate": 7.090123159604234e-07,
"loss": 0.8414,
"step": 3630
},
{
"epoch": 5.073170731707317,
"grad_norm": 0.769673764705658,
"learning_rate": 6.809811314508386e-07,
"loss": 0.8604,
"step": 3640
},
{
"epoch": 5.087108013937282,
"grad_norm": 0.55353182554245,
"learning_rate": 6.534957569482214e-07,
"loss": 0.8601,
"step": 3650
},
{
"epoch": 5.101045296167247,
"grad_norm": 0.6018425822257996,
"learning_rate": 6.265578022564233e-07,
"loss": 0.8661,
"step": 3660
},
{
"epoch": 5.114982578397212,
"grad_norm": 0.534132182598114,
"learning_rate": 6.001688451172027e-07,
"loss": 0.8218,
"step": 3670
},
{
"epoch": 5.128919860627177,
"grad_norm": 0.5793606042861938,
"learning_rate": 5.743304311178289e-07,
"loss": 0.8399,
"step": 3680
},
{
"epoch": 5.142857142857143,
"grad_norm": 0.6463719010353088,
"learning_rate": 5.490440736005397e-07,
"loss": 0.8249,
"step": 3690
},
{
"epoch": 5.156794425087108,
"grad_norm": 0.5509739518165588,
"learning_rate": 5.24311253573927e-07,
"loss": 0.8125,
"step": 3700
},
{
"epoch": 5.170731707317073,
"grad_norm": 0.4920913279056549,
"learning_rate": 5.001334196261776e-07,
"loss": 0.8701,
"step": 3710
},
{
"epoch": 5.184668989547038,
"grad_norm": 0.5054136514663696,
"learning_rate": 4.765119878402424e-07,
"loss": 0.8548,
"step": 3720
},
{
"epoch": 5.198606271777003,
"grad_norm": 0.5016888380050659,
"learning_rate": 4.5344834171088594e-07,
"loss": 0.837,
"step": 3730
},
{
"epoch": 5.2125435540069684,
"grad_norm": 0.5839513540267944,
"learning_rate": 4.309438320636705e-07,
"loss": 0.8781,
"step": 3740
},
{
"epoch": 5.2264808362369335,
"grad_norm": 0.5476316213607788,
"learning_rate": 4.089997769758225e-07,
"loss": 0.8616,
"step": 3750
},
{
"epoch": 5.2404181184668985,
"grad_norm": 0.562765896320343,
"learning_rate": 3.876174616990402e-07,
"loss": 0.8614,
"step": 3760
},
{
"epoch": 5.2543554006968645,
"grad_norm": 0.5283138751983643,
"learning_rate": 3.6679813858422673e-07,
"loss": 0.8013,
"step": 3770
},
{
"epoch": 5.2682926829268295,
"grad_norm": 0.4689981937408447,
"learning_rate": 3.46543027008126e-07,
"loss": 0.7839,
"step": 3780
},
{
"epoch": 5.2822299651567945,
"grad_norm": 0.46892431378364563,
"learning_rate": 3.2685331330190916e-07,
"loss": 0.8268,
"step": 3790
},
{
"epoch": 5.29616724738676,
"grad_norm": 0.5027751326560974,
"learning_rate": 3.0773015068169876e-07,
"loss": 0.8209,
"step": 3800
},
{
"epoch": 5.310104529616725,
"grad_norm": 0.6267147660255432,
"learning_rate": 2.891746591810152e-07,
"loss": 0.8121,
"step": 3810
},
{
"epoch": 5.32404181184669,
"grad_norm": 0.5063154697418213,
"learning_rate": 2.7118792558518237e-07,
"loss": 0.8622,
"step": 3820
},
{
"epoch": 5.337979094076655,
"grad_norm": 0.5375659465789795,
"learning_rate": 2.5377100336767547e-07,
"loss": 0.8033,
"step": 3830
},
{
"epoch": 5.351916376306621,
"grad_norm": 0.5449223518371582,
"learning_rate": 2.3692491262841788e-07,
"loss": 0.848,
"step": 3840
},
{
"epoch": 5.365853658536586,
"grad_norm": 0.5025774240493774,
"learning_rate": 2.206506400340369e-07,
"loss": 0.8403,
"step": 3850
},
{
"epoch": 5.379790940766551,
"grad_norm": 0.4367560148239136,
"learning_rate": 2.0494913876007105e-07,
"loss": 0.8539,
"step": 3860
},
{
"epoch": 5.393728222996516,
"grad_norm": 0.5829946994781494,
"learning_rate": 1.8982132843514577e-07,
"loss": 0.8371,
"step": 3870
},
{
"epoch": 5.407665505226481,
"grad_norm": 0.5280970931053162,
"learning_rate": 1.752680950871144e-07,
"loss": 0.8705,
"step": 3880
},
{
"epoch": 5.421602787456446,
"grad_norm": 0.5541146993637085,
"learning_rate": 1.6129029109115401e-07,
"loss": 0.8514,
"step": 3890
},
{
"epoch": 5.435540069686411,
"grad_norm": 0.5944886207580566,
"learning_rate": 1.4788873511985656e-07,
"loss": 0.8545,
"step": 3900
},
{
"epoch": 5.449477351916376,
"grad_norm": 0.5219863653182983,
"learning_rate": 1.350642120952661e-07,
"loss": 0.9,
"step": 3910
},
{
"epoch": 5.463414634146342,
"grad_norm": 0.5505541563034058,
"learning_rate": 1.2281747314291437e-07,
"loss": 0.8239,
"step": 3920
},
{
"epoch": 5.477351916376307,
"grad_norm": 0.551377534866333,
"learning_rate": 1.1114923554782608e-07,
"loss": 0.8817,
"step": 3930
},
{
"epoch": 5.491289198606272,
"grad_norm": 0.5536546111106873,
"learning_rate": 1.0006018271250695e-07,
"loss": 0.8719,
"step": 3940
},
{
"epoch": 5.505226480836237,
"grad_norm": 0.48071563243865967,
"learning_rate": 8.955096411691566e-08,
"loss": 0.8517,
"step": 3950
},
{
"epoch": 5.519163763066202,
"grad_norm": 0.5239782929420471,
"learning_rate": 7.962219528042991e-08,
"loss": 0.8284,
"step": 3960
},
{
"epoch": 5.533101045296167,
"grad_norm": 0.5262070298194885,
"learning_rate": 7.027445772578856e-08,
"loss": 0.8277,
"step": 3970
},
{
"epoch": 5.547038327526132,
"grad_norm": 0.5303363800048828,
"learning_rate": 6.150829894503662e-08,
"loss": 0.8648,
"step": 3980
},
{
"epoch": 5.560975609756097,
"grad_norm": 0.5235099196434021,
"learning_rate": 5.332423236745765e-08,
"loss": 0.8722,
"step": 3990
},
{
"epoch": 5.574912891986063,
"grad_norm": 0.5281161665916443,
"learning_rate": 4.5722737329505495e-08,
"loss": 0.8452,
"step": 4000
},
{
"epoch": 5.588850174216028,
"grad_norm": 0.6809967756271362,
"learning_rate": 3.870425904672237e-08,
"loss": 0.8571,
"step": 4010
},
{
"epoch": 5.602787456445993,
"grad_norm": 0.5919767618179321,
"learning_rate": 3.22692085876708e-08,
"loss": 0.8392,
"step": 4020
},
{
"epoch": 5.616724738675958,
"grad_norm": 0.5929062962532043,
"learning_rate": 2.6417962849852875e-08,
"loss": 0.7991,
"step": 4030
},
{
"epoch": 5.630662020905923,
"grad_norm": 85.85006713867188,
"learning_rate": 2.1150864537636817e-08,
"loss": 0.8357,
"step": 4040
},
{
"epoch": 5.644599303135888,
"grad_norm": 0.5508357286453247,
"learning_rate": 1.646822214218524e-08,
"loss": 0.8502,
"step": 4050
},
{
"epoch": 5.658536585365853,
"grad_norm": 0.5149642825126648,
"learning_rate": 1.2370309923388501e-08,
"loss": 0.8546,
"step": 4060
},
{
"epoch": 5.672473867595819,
"grad_norm": 0.601134181022644,
"learning_rate": 8.857367893796431e-09,
"loss": 0.8809,
"step": 4070
},
{
"epoch": 5.686411149825784,
"grad_norm": 0.6303636431694031,
"learning_rate": 5.929601804566254e-09,
"loss": 0.8678,
"step": 4080
},
{
"epoch": 5.700348432055749,
"grad_norm": 0.5462765097618103,
"learning_rate": 3.5871831334099992e-09,
"loss": 0.843,
"step": 4090
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.6144183278083801,
"learning_rate": 1.8302490745503166e-09,
"loss": 0.8146,
"step": 4100
},
{
"epoch": 5.7282229965156795,
"grad_norm": 0.5314708352088928,
"learning_rate": 6.589025306869002e-10,
"loss": 0.8237,
"step": 4110
},
{
"epoch": 5.7421602787456445,
"grad_norm": 0.56773841381073,
"learning_rate": 7.321210696464853e-11,
"loss": 0.8358,
"step": 4120
}
],
"logging_steps": 10,
"max_steps": 4125,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.88632668766208e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}