{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.7491289198606275, "eval_steps": 500, "global_step": 4125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013937282229965157, "grad_norm": 0.689270555973053, "learning_rate": 1e-05, "loss": 3.465, "step": 10 }, { "epoch": 0.027874564459930314, "grad_norm": 0.3839555084705353, "learning_rate": 2e-05, "loss": 3.4261, "step": 20 }, { "epoch": 0.041811846689895474, "grad_norm": 0.8513675928115845, "learning_rate": 1.9999707152644143e-05, "loss": 3.0148, "step": 30 }, { "epoch": 0.05574912891986063, "grad_norm": 0.5808806419372559, "learning_rate": 1.9998828627728483e-05, "loss": 2.8815, "step": 40 }, { "epoch": 0.06968641114982578, "grad_norm": 0.8429257869720459, "learning_rate": 1.9997364476707765e-05, "loss": 2.4359, "step": 50 }, { "epoch": 0.08362369337979095, "grad_norm": 9.207444190979004, "learning_rate": 1.9995314785336534e-05, "loss": 2.3272, "step": 60 }, { "epoch": 0.0975609756097561, "grad_norm": 0.8501074314117432, "learning_rate": 1.9992679673664136e-05, "loss": 2.3717, "step": 70 }, { "epoch": 0.11149825783972125, "grad_norm": 1.1296913623809814, "learning_rate": 1.998945929602766e-05, "loss": 1.9401, "step": 80 }, { "epoch": 0.1254355400696864, "grad_norm": 0.7665435075759888, "learning_rate": 1.9985653841042926e-05, "loss": 2.0245, "step": 90 }, { "epoch": 0.13937282229965156, "grad_norm": 1.1252214908599854, "learning_rate": 1.9981263531593422e-05, "loss": 1.847, "step": 100 }, { "epoch": 0.15331010452961671, "grad_norm": 0.6236267685890198, "learning_rate": 1.997628862481725e-05, "loss": 1.6229, "step": 110 }, { "epoch": 0.1672473867595819, "grad_norm": 0.41556432843208313, "learning_rate": 1.9970729412092064e-05, "loss": 1.6222, "step": 120 }, { "epoch": 0.18118466898954705, "grad_norm": 1.1682895421981812, "learning_rate": 1.9964586219018018e-05, "loss": 1.6162, "step": 130 }, { "epoch": 0.1951219512195122, "grad_norm": 0.804908037185669, "learning_rate": 1.995785940539868e-05, "loss": 1.5503, "step": 140 }, { "epoch": 0.20905923344947736, "grad_norm": 0.5646180510520935, "learning_rate": 1.995054936521997e-05, "loss": 1.5357, "step": 150 }, { "epoch": 0.2229965156794425, "grad_norm": 0.667831301689148, "learning_rate": 1.994265652662707e-05, "loss": 1.4177, "step": 160 }, { "epoch": 0.23693379790940766, "grad_norm": 0.6871632933616638, "learning_rate": 1.9934181351899365e-05, "loss": 1.4035, "step": 170 }, { "epoch": 0.2508710801393728, "grad_norm": 0.5316534638404846, "learning_rate": 1.9925124337423356e-05, "loss": 1.3704, "step": 180 }, { "epoch": 0.26480836236933797, "grad_norm": 0.4839189946651459, "learning_rate": 1.9915486013663595e-05, "loss": 1.2654, "step": 190 }, { "epoch": 0.2787456445993031, "grad_norm": 0.6332902908325195, "learning_rate": 1.99052669451316e-05, "loss": 1.341, "step": 200 }, { "epoch": 0.2926829268292683, "grad_norm": 0.8837312459945679, "learning_rate": 1.9894467730352817e-05, "loss": 1.2951, "step": 210 }, { "epoch": 0.30662020905923343, "grad_norm": 0.5074018239974976, "learning_rate": 1.9883089001831545e-05, "loss": 1.2118, "step": 220 }, { "epoch": 0.3205574912891986, "grad_norm": 0.4832253158092499, "learning_rate": 1.9871131426013894e-05, "loss": 1.1841, "step": 230 }, { "epoch": 0.3344947735191638, "grad_norm": 0.593585193157196, "learning_rate": 1.9858595703248755e-05, "loss": 1.1881, "step": 240 }, { "epoch": 0.34843205574912894, "grad_norm": 2.605353832244873, "learning_rate": 1.9845482567746783e-05, "loss": 1.185, "step": 250 }, { "epoch": 0.3623693379790941, "grad_norm": 0.5042847990989685, "learning_rate": 1.983179278753739e-05, "loss": 1.1611, "step": 260 }, { "epoch": 0.37630662020905925, "grad_norm": 0.5455293655395508, "learning_rate": 1.981752716442376e-05, "loss": 1.1387, "step": 270 }, { "epoch": 0.3902439024390244, "grad_norm": 0.5077016949653625, "learning_rate": 1.9802686533935903e-05, "loss": 1.1303, "step": 280 }, { "epoch": 0.40418118466898956, "grad_norm": 1.6482897996902466, "learning_rate": 1.9787271765281684e-05, "loss": 1.1113, "step": 290 }, { "epoch": 0.4181184668989547, "grad_norm": 0.706795871257782, "learning_rate": 1.9771283761295966e-05, "loss": 1.2045, "step": 300 }, { "epoch": 0.43205574912891986, "grad_norm": 0.4687112867832184, "learning_rate": 1.975472345838768e-05, "loss": 1.1115, "step": 310 }, { "epoch": 0.445993031358885, "grad_norm": 0.6432089805603027, "learning_rate": 1.9737591826485013e-05, "loss": 1.0392, "step": 320 }, { "epoch": 0.45993031358885017, "grad_norm": 0.5330508351325989, "learning_rate": 1.9719889868978582e-05, "loss": 1.1082, "step": 330 }, { "epoch": 0.4738675958188153, "grad_norm": 0.5292240381240845, "learning_rate": 1.970161862266268e-05, "loss": 1.0845, "step": 340 }, { "epoch": 0.4878048780487805, "grad_norm": 0.5111907720565796, "learning_rate": 1.968277915767454e-05, "loss": 1.0763, "step": 350 }, { "epoch": 0.5017421602787456, "grad_norm": 0.6637840270996094, "learning_rate": 1.9663372577431663e-05, "loss": 1.0727, "step": 360 }, { "epoch": 0.5156794425087108, "grad_norm": 0.5857555270195007, "learning_rate": 1.9643400018567195e-05, "loss": 1.0431, "step": 370 }, { "epoch": 0.5296167247386759, "grad_norm": 0.4895532429218292, "learning_rate": 1.962286265086334e-05, "loss": 1.0656, "step": 380 }, { "epoch": 0.5435540069686411, "grad_norm": 0.4925576448440552, "learning_rate": 1.9601761677182868e-05, "loss": 1.0581, "step": 390 }, { "epoch": 0.5574912891986062, "grad_norm": 0.49863460659980774, "learning_rate": 1.958009833339865e-05, "loss": 1.0236, "step": 400 }, { "epoch": 0.5714285714285714, "grad_norm": 0.4993515610694885, "learning_rate": 1.955787388832127e-05, "loss": 1.0382, "step": 410 }, { "epoch": 0.5853658536585366, "grad_norm": 0.5540180206298828, "learning_rate": 1.953508964362473e-05, "loss": 1.06, "step": 420 }, { "epoch": 0.5993031358885017, "grad_norm": 0.47915443778038025, "learning_rate": 1.9511746933770186e-05, "loss": 1.0793, "step": 430 }, { "epoch": 0.6132404181184669, "grad_norm": 0.931480348110199, "learning_rate": 1.9487847125927814e-05, "loss": 1.0125, "step": 440 }, { "epoch": 0.627177700348432, "grad_norm": 0.5183358192443848, "learning_rate": 1.946339161989672e-05, "loss": 0.9963, "step": 450 }, { "epoch": 0.6411149825783972, "grad_norm": 0.4870765507221222, "learning_rate": 1.943838184802296e-05, "loss": 1.0556, "step": 460 }, { "epoch": 0.6550522648083623, "grad_norm": 0.5741175413131714, "learning_rate": 1.9412819275115648e-05, "loss": 1.0337, "step": 470 }, { "epoch": 0.6689895470383276, "grad_norm": 6.292229652404785, "learning_rate": 1.9386705398361156e-05, "loss": 1.0487, "step": 480 }, { "epoch": 0.6829268292682927, "grad_norm": 0.508897602558136, "learning_rate": 1.9360041747235437e-05, "loss": 1.0538, "step": 490 }, { "epoch": 0.6968641114982579, "grad_norm": 2.2197482585906982, "learning_rate": 1.9332829883414444e-05, "loss": 0.9835, "step": 500 }, { "epoch": 0.710801393728223, "grad_norm": 0.5469727516174316, "learning_rate": 1.9305071400682644e-05, "loss": 1.023, "step": 510 }, { "epoch": 0.7247386759581882, "grad_norm": 0.5392600297927856, "learning_rate": 1.9276767924839687e-05, "loss": 0.9832, "step": 520 }, { "epoch": 0.7386759581881533, "grad_norm": 0.5167688727378845, "learning_rate": 1.9247921113605197e-05, "loss": 0.9896, "step": 530 }, { "epoch": 0.7526132404181185, "grad_norm": 0.622711718082428, "learning_rate": 1.921853265652164e-05, "loss": 0.9443, "step": 540 }, { "epoch": 0.7665505226480837, "grad_norm": 0.7172455787658691, "learning_rate": 1.9188604274855417e-05, "loss": 1.0113, "step": 550 }, { "epoch": 0.7804878048780488, "grad_norm": 0.5286630988121033, "learning_rate": 1.9158137721496014e-05, "loss": 1.0027, "step": 560 }, { "epoch": 0.794425087108014, "grad_norm": 0.531980037689209, "learning_rate": 1.9127134780853343e-05, "loss": 0.9521, "step": 570 }, { "epoch": 0.8083623693379791, "grad_norm": 0.5521572828292847, "learning_rate": 1.9095597268753243e-05, "loss": 1.006, "step": 580 }, { "epoch": 0.8222996515679443, "grad_norm": 0.953914999961853, "learning_rate": 1.9063527032331128e-05, "loss": 0.9401, "step": 590 }, { "epoch": 0.8362369337979094, "grad_norm": 0.6153273582458496, "learning_rate": 1.9030925949923777e-05, "loss": 0.9872, "step": 600 }, { "epoch": 0.8501742160278746, "grad_norm": 0.7658360004425049, "learning_rate": 1.899779593095935e-05, "loss": 0.9842, "step": 610 }, { "epoch": 0.8641114982578397, "grad_norm": 0.6273182034492493, "learning_rate": 1.896413891584554e-05, "loss": 0.9636, "step": 620 }, { "epoch": 0.8780487804878049, "grad_norm": 0.5916078090667725, "learning_rate": 1.8929956875855913e-05, "loss": 0.9845, "step": 630 }, { "epoch": 0.89198606271777, "grad_norm": 0.5227757096290588, "learning_rate": 1.8895251813014486e-05, "loss": 0.9813, "step": 640 }, { "epoch": 0.9059233449477352, "grad_norm": 0.5434836745262146, "learning_rate": 1.8860025759978436e-05, "loss": 0.9522, "step": 650 }, { "epoch": 0.9198606271777003, "grad_norm": 0.48904576897621155, "learning_rate": 1.8824280779919055e-05, "loss": 0.937, "step": 660 }, { "epoch": 0.9337979094076655, "grad_norm": 0.6036517024040222, "learning_rate": 1.8788018966400923e-05, "loss": 0.9587, "step": 670 }, { "epoch": 0.9477351916376306, "grad_norm": 0.5333548188209534, "learning_rate": 1.8751242443259286e-05, "loss": 0.9564, "step": 680 }, { "epoch": 0.9616724738675958, "grad_norm": 0.6121916770935059, "learning_rate": 1.8713953364475654e-05, "loss": 0.9676, "step": 690 }, { "epoch": 0.975609756097561, "grad_norm": 0.5846276879310608, "learning_rate": 1.8676153914051648e-05, "loss": 0.9417, "step": 700 }, { "epoch": 0.9895470383275261, "grad_norm": 0.6067516207695007, "learning_rate": 1.8637846305881092e-05, "loss": 0.9615, "step": 710 }, { "epoch": 1.0034843205574913, "grad_norm": 0.4959181547164917, "learning_rate": 1.859903278362034e-05, "loss": 0.9361, "step": 720 }, { "epoch": 1.0174216027874565, "grad_norm": 0.554417073726654, "learning_rate": 1.8559715620556865e-05, "loss": 0.984, "step": 730 }, { "epoch": 1.0313588850174216, "grad_norm": 0.5190228223800659, "learning_rate": 1.8519897119476115e-05, "loss": 0.9571, "step": 740 }, { "epoch": 1.0452961672473868, "grad_norm": 0.5573728680610657, "learning_rate": 1.8479579612526642e-05, "loss": 0.9324, "step": 750 }, { "epoch": 1.0592334494773519, "grad_norm": 0.5803472399711609, "learning_rate": 1.8438765461083504e-05, "loss": 0.9274, "step": 760 }, { "epoch": 1.0731707317073171, "grad_norm": 0.5444169044494629, "learning_rate": 1.8397457055609973e-05, "loss": 0.9278, "step": 770 }, { "epoch": 1.0871080139372822, "grad_norm": 0.5003806352615356, "learning_rate": 1.8355656815517505e-05, "loss": 0.9392, "step": 780 }, { "epoch": 1.1010452961672474, "grad_norm": 1.262714147567749, "learning_rate": 1.8313367189024065e-05, "loss": 0.9741, "step": 790 }, { "epoch": 1.1149825783972125, "grad_norm": 0.8410710692405701, "learning_rate": 1.8270590653010706e-05, "loss": 0.9815, "step": 800 }, { "epoch": 1.1289198606271778, "grad_norm": 0.554559051990509, "learning_rate": 1.8227329712876525e-05, "loss": 0.9085, "step": 810 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5503095388412476, "learning_rate": 1.8183586902391905e-05, "loss": 0.9034, "step": 820 }, { "epoch": 1.156794425087108, "grad_norm": 0.5442657470703125, "learning_rate": 1.8139364783550128e-05, "loss": 0.9525, "step": 830 }, { "epoch": 1.170731707317073, "grad_norm": 0.5207365155220032, "learning_rate": 1.8094665946417304e-05, "loss": 0.9166, "step": 840 }, { "epoch": 1.1846689895470384, "grad_norm": 0.48352962732315063, "learning_rate": 1.8049493008980685e-05, "loss": 0.9187, "step": 850 }, { "epoch": 1.1986062717770034, "grad_norm": 0.4856855869293213, "learning_rate": 1.8003848616995333e-05, "loss": 0.9226, "step": 860 }, { "epoch": 1.2125435540069687, "grad_norm": 4.280817031860352, "learning_rate": 1.795773544382915e-05, "loss": 0.9301, "step": 870 }, { "epoch": 1.2264808362369337, "grad_norm": 0.6370670199394226, "learning_rate": 1.7911156190306296e-05, "loss": 0.9843, "step": 880 }, { "epoch": 1.240418118466899, "grad_norm": 0.7971873879432678, "learning_rate": 1.786411358454902e-05, "loss": 0.9352, "step": 890 }, { "epoch": 1.254355400696864, "grad_norm": 0.6326978802680969, "learning_rate": 1.7816610381817864e-05, "loss": 0.8951, "step": 900 }, { "epoch": 1.2682926829268293, "grad_norm": 0.8231450319290161, "learning_rate": 1.776864936435029e-05, "loss": 0.9498, "step": 910 }, { "epoch": 1.2822299651567945, "grad_norm": 0.4994203746318817, "learning_rate": 1.7720233341197726e-05, "loss": 0.9127, "step": 920 }, { "epoch": 1.2961672473867596, "grad_norm": 0.6145898103713989, "learning_rate": 1.7671365148061053e-05, "loss": 0.9249, "step": 930 }, { "epoch": 1.3101045296167246, "grad_norm": 0.5441964864730835, "learning_rate": 1.7622047647124488e-05, "loss": 0.9078, "step": 940 }, { "epoch": 1.32404181184669, "grad_norm": 0.5940006375312805, "learning_rate": 1.757228372688799e-05, "loss": 0.8937, "step": 950 }, { "epoch": 1.3379790940766552, "grad_norm": 0.6185000538825989, "learning_rate": 1.7522076301998048e-05, "loss": 0.8922, "step": 960 }, { "epoch": 1.3519163763066202, "grad_norm": 0.6179748773574829, "learning_rate": 1.7471428313076984e-05, "loss": 0.8864, "step": 970 }, { "epoch": 1.3658536585365852, "grad_norm": 0.6203311681747437, "learning_rate": 1.7420342726550728e-05, "loss": 0.9071, "step": 980 }, { "epoch": 1.3797909407665505, "grad_norm": 0.4726350009441376, "learning_rate": 1.736882253447506e-05, "loss": 0.9225, "step": 990 }, { "epoch": 1.3937282229965158, "grad_norm": 0.5029098987579346, "learning_rate": 1.73168707543604e-05, "loss": 0.9036, "step": 1000 }, { "epoch": 1.4076655052264808, "grad_norm": 0.5293656587600708, "learning_rate": 1.726449042899502e-05, "loss": 0.9093, "step": 1010 }, { "epoch": 1.4216027874564459, "grad_norm": 0.5243374109268188, "learning_rate": 1.7211684626266887e-05, "loss": 0.8831, "step": 1020 }, { "epoch": 1.4355400696864111, "grad_norm": 0.5120546221733093, "learning_rate": 1.7158456438983934e-05, "loss": 0.9138, "step": 1030 }, { "epoch": 1.4494773519163764, "grad_norm": 0.4908638596534729, "learning_rate": 1.7104808984692946e-05, "loss": 0.9152, "step": 1040 }, { "epoch": 1.4634146341463414, "grad_norm": 0.5942572951316833, "learning_rate": 1.705074540549695e-05, "loss": 0.9535, "step": 1050 }, { "epoch": 1.4773519163763067, "grad_norm": 0.6341879367828369, "learning_rate": 1.699626886787119e-05, "loss": 0.9506, "step": 1060 }, { "epoch": 1.4912891986062717, "grad_norm": 0.49885329604148865, "learning_rate": 1.6941382562477664e-05, "loss": 0.9313, "step": 1070 }, { "epoch": 1.505226480836237, "grad_norm": 0.5285991430282593, "learning_rate": 1.688608970397825e-05, "loss": 0.9164, "step": 1080 }, { "epoch": 1.519163763066202, "grad_norm": 0.5937536358833313, "learning_rate": 1.683039353084644e-05, "loss": 0.8853, "step": 1090 }, { "epoch": 1.533101045296167, "grad_norm": 0.5034327507019043, "learning_rate": 1.677429730517763e-05, "loss": 0.9081, "step": 1100 }, { "epoch": 1.5470383275261324, "grad_norm": 0.48388397693634033, "learning_rate": 1.67178043124981e-05, "loss": 0.8786, "step": 1110 }, { "epoch": 1.5609756097560976, "grad_norm": 0.6228198409080505, "learning_rate": 1.666091786157255e-05, "loss": 0.8607, "step": 1120 }, { "epoch": 1.5749128919860627, "grad_norm": 0.4986213147640228, "learning_rate": 1.6603641284210335e-05, "loss": 0.8904, "step": 1130 }, { "epoch": 1.588850174216028, "grad_norm": 0.4710678458213806, "learning_rate": 1.6545977935070293e-05, "loss": 0.8807, "step": 1140 }, { "epoch": 1.6027874564459932, "grad_norm": 0.5493403673171997, "learning_rate": 1.6487931191464293e-05, "loss": 0.9389, "step": 1150 }, { "epoch": 1.6167247386759582, "grad_norm": 0.5593530535697937, "learning_rate": 1.642950445315941e-05, "loss": 0.9294, "step": 1160 }, { "epoch": 1.6306620209059233, "grad_norm": 0.5576480031013489, "learning_rate": 1.6370701142178815e-05, "loss": 0.8685, "step": 1170 }, { "epoch": 1.6445993031358885, "grad_norm": 0.5916953682899475, "learning_rate": 1.6311524702601328e-05, "loss": 0.8794, "step": 1180 }, { "epoch": 1.6585365853658538, "grad_norm": 0.49112585186958313, "learning_rate": 1.6251978600359727e-05, "loss": 0.8893, "step": 1190 }, { "epoch": 1.6724738675958188, "grad_norm": 0.606788694858551, "learning_rate": 1.6192066323037723e-05, "loss": 0.9162, "step": 1200 }, { "epoch": 1.6864111498257839, "grad_norm": 0.5515270829200745, "learning_rate": 1.613179137966572e-05, "loss": 0.9027, "step": 1210 }, { "epoch": 1.7003484320557491, "grad_norm": 0.51644366979599, "learning_rate": 1.6071157300515274e-05, "loss": 0.9218, "step": 1220 }, { "epoch": 1.7142857142857144, "grad_norm": 0.48575639724731445, "learning_rate": 1.6010167636892338e-05, "loss": 0.9032, "step": 1230 }, { "epoch": 1.7282229965156795, "grad_norm": 0.5278819799423218, "learning_rate": 1.594882596092926e-05, "loss": 0.9159, "step": 1240 }, { "epoch": 1.7421602787456445, "grad_norm": 0.554883599281311, "learning_rate": 1.5887135865375552e-05, "loss": 0.9046, "step": 1250 }, { "epoch": 1.7560975609756098, "grad_norm": 0.5662369728088379, "learning_rate": 1.58251009633875e-05, "loss": 0.8528, "step": 1260 }, { "epoch": 1.770034843205575, "grad_norm": 0.6568381786346436, "learning_rate": 1.57627248883165e-05, "loss": 0.8885, "step": 1270 }, { "epoch": 1.78397212543554, "grad_norm": 0.7277708649635315, "learning_rate": 1.5700011293496285e-05, "loss": 0.9159, "step": 1280 }, { "epoch": 1.797909407665505, "grad_norm": 0.5788251161575317, "learning_rate": 1.5636963852028936e-05, "loss": 0.9036, "step": 1290 }, { "epoch": 1.8118466898954704, "grad_norm": 0.5556735396385193, "learning_rate": 1.557358625656976e-05, "loss": 0.9155, "step": 1300 }, { "epoch": 1.8257839721254356, "grad_norm": 0.4880397319793701, "learning_rate": 1.550988221911101e-05, "loss": 0.8849, "step": 1310 }, { "epoch": 1.8397212543554007, "grad_norm": 0.6523249745368958, "learning_rate": 1.5445855470764467e-05, "loss": 0.8644, "step": 1320 }, { "epoch": 1.8536585365853657, "grad_norm": 0.8619920015335083, "learning_rate": 1.5381509761542925e-05, "loss": 0.9073, "step": 1330 }, { "epoch": 1.867595818815331, "grad_norm": 0.6176061034202576, "learning_rate": 1.5316848860140545e-05, "loss": 0.877, "step": 1340 }, { "epoch": 1.8815331010452963, "grad_norm": 0.49368295073509216, "learning_rate": 1.5251876553712129e-05, "loss": 0.8854, "step": 1350 }, { "epoch": 1.8954703832752613, "grad_norm": 0.6543199419975281, "learning_rate": 1.5186596647651299e-05, "loss": 0.883, "step": 1360 }, { "epoch": 1.9094076655052263, "grad_norm": 0.568365216255188, "learning_rate": 1.512101296536764e-05, "loss": 0.9144, "step": 1370 }, { "epoch": 1.9233449477351916, "grad_norm": 0.5592637062072754, "learning_rate": 1.5055129348062733e-05, "loss": 0.8869, "step": 1380 }, { "epoch": 1.9372822299651569, "grad_norm": 0.642049252986908, "learning_rate": 1.4988949654505212e-05, "loss": 0.9268, "step": 1390 }, { "epoch": 1.951219512195122, "grad_norm": 0.8612108826637268, "learning_rate": 1.492247776080472e-05, "loss": 0.9231, "step": 1400 }, { "epoch": 1.965156794425087, "grad_norm": 0.5690594911575317, "learning_rate": 1.4855717560184925e-05, "loss": 0.8862, "step": 1410 }, { "epoch": 1.9790940766550522, "grad_norm": 0.5545530915260315, "learning_rate": 1.4788672962755474e-05, "loss": 0.8777, "step": 1420 }, { "epoch": 1.9930313588850175, "grad_norm": 0.5686807036399841, "learning_rate": 1.4721347895282977e-05, "loss": 0.867, "step": 1430 }, { "epoch": 2.0069686411149825, "grad_norm": 0.49507030844688416, "learning_rate": 1.4653746300961037e-05, "loss": 0.8879, "step": 1440 }, { "epoch": 2.0209059233449476, "grad_norm": 0.5000828504562378, "learning_rate": 1.4585872139179284e-05, "loss": 0.8951, "step": 1450 }, { "epoch": 2.034843205574913, "grad_norm": 0.5445813536643982, "learning_rate": 1.4517729385291479e-05, "loss": 0.8741, "step": 1460 }, { "epoch": 2.048780487804878, "grad_norm": 0.5599672198295593, "learning_rate": 1.4449322030382681e-05, "loss": 0.8956, "step": 1470 }, { "epoch": 2.062717770034843, "grad_norm": 0.579526424407959, "learning_rate": 1.4380654081035492e-05, "loss": 0.8655, "step": 1480 }, { "epoch": 2.076655052264808, "grad_norm": 0.5371329188346863, "learning_rate": 1.4311729559095391e-05, "loss": 0.8916, "step": 1490 }, { "epoch": 2.0905923344947737, "grad_norm": 0.5372903943061829, "learning_rate": 1.424255250143518e-05, "loss": 0.9006, "step": 1500 }, { "epoch": 2.1045296167247387, "grad_norm": 0.5461590886116028, "learning_rate": 1.4173126959718542e-05, "loss": 0.8981, "step": 1510 }, { "epoch": 2.1184668989547037, "grad_norm": 0.5336124897003174, "learning_rate": 1.410345700016274e-05, "loss": 0.8979, "step": 1520 }, { "epoch": 2.132404181184669, "grad_norm": 0.512737512588501, "learning_rate": 1.4033546703300465e-05, "loss": 0.8549, "step": 1530 }, { "epoch": 2.1463414634146343, "grad_norm": 0.5914519429206848, "learning_rate": 1.3963400163740828e-05, "loss": 0.8807, "step": 1540 }, { "epoch": 2.1602787456445993, "grad_norm": 0.6203148365020752, "learning_rate": 1.3893021489929564e-05, "loss": 0.9025, "step": 1550 }, { "epoch": 2.1742160278745644, "grad_norm": 0.47906365990638733, "learning_rate": 1.382241480390837e-05, "loss": 0.9091, "step": 1560 }, { "epoch": 2.1881533101045294, "grad_norm": 1.1542456150054932, "learning_rate": 1.3751584241073517e-05, "loss": 0.8571, "step": 1570 }, { "epoch": 2.202090592334495, "grad_norm": 0.778533935546875, "learning_rate": 1.3680533949933607e-05, "loss": 0.8534, "step": 1580 }, { "epoch": 2.21602787456446, "grad_norm": 0.5771265625953674, "learning_rate": 1.3609268091866621e-05, "loss": 0.8709, "step": 1590 }, { "epoch": 2.229965156794425, "grad_norm": 0.5153730511665344, "learning_rate": 1.3537790840876179e-05, "loss": 0.8865, "step": 1600 }, { "epoch": 2.2439024390243905, "grad_norm": 0.5823934674263, "learning_rate": 1.346610638334707e-05, "loss": 0.8608, "step": 1610 }, { "epoch": 2.2578397212543555, "grad_norm": 0.4887414872646332, "learning_rate": 1.3394218917800064e-05, "loss": 0.8661, "step": 1620 }, { "epoch": 2.2717770034843205, "grad_norm": 0.5397761464118958, "learning_rate": 1.3322132654646003e-05, "loss": 0.8719, "step": 1630 }, { "epoch": 2.2857142857142856, "grad_norm": 0.7656607627868652, "learning_rate": 1.3249851815939197e-05, "loss": 0.8857, "step": 1640 }, { "epoch": 2.2996515679442506, "grad_norm": 0.5524553060531616, "learning_rate": 1.3177380635130144e-05, "loss": 0.8957, "step": 1650 }, { "epoch": 2.313588850174216, "grad_norm": 0.7648917436599731, "learning_rate": 1.3104723356817582e-05, "loss": 0.8746, "step": 1660 }, { "epoch": 2.327526132404181, "grad_norm": 0.696306049823761, "learning_rate": 1.3031884236499877e-05, "loss": 0.8732, "step": 1670 }, { "epoch": 2.341463414634146, "grad_norm": 0.5518249273300171, "learning_rate": 1.2958867540325785e-05, "loss": 0.8641, "step": 1680 }, { "epoch": 2.3554006968641117, "grad_norm": 0.5839936137199402, "learning_rate": 1.2885677544844592e-05, "loss": 0.8317, "step": 1690 }, { "epoch": 2.3693379790940767, "grad_norm": 0.5415021777153015, "learning_rate": 1.2812318536755624e-05, "loss": 0.8815, "step": 1700 }, { "epoch": 2.3832752613240418, "grad_norm": 0.5816763639450073, "learning_rate": 1.2738794812657194e-05, "loss": 0.8682, "step": 1710 }, { "epoch": 2.397212543554007, "grad_norm": 0.5739949941635132, "learning_rate": 1.266511067879494e-05, "loss": 0.8928, "step": 1720 }, { "epoch": 2.4111498257839723, "grad_norm": 0.5285424590110779, "learning_rate": 1.2591270450809612e-05, "loss": 0.9042, "step": 1730 }, { "epoch": 2.4250871080139373, "grad_norm": 0.67451012134552, "learning_rate": 1.251727845348432e-05, "loss": 0.9084, "step": 1740 }, { "epoch": 2.4390243902439024, "grad_norm": 0.6238117218017578, "learning_rate": 1.2443139020491216e-05, "loss": 0.8828, "step": 1750 }, { "epoch": 2.4529616724738674, "grad_norm": 0.527727484703064, "learning_rate": 1.236885649413768e-05, "loss": 0.8348, "step": 1760 }, { "epoch": 2.466898954703833, "grad_norm": 0.6208236813545227, "learning_rate": 1.2294435225112005e-05, "loss": 0.8976, "step": 1770 }, { "epoch": 2.480836236933798, "grad_norm": 0.6415792107582092, "learning_rate": 1.2219879572228555e-05, "loss": 0.853, "step": 1780 }, { "epoch": 2.494773519163763, "grad_norm": 0.5672902464866638, "learning_rate": 1.2145193902172496e-05, "loss": 0.8624, "step": 1790 }, { "epoch": 2.508710801393728, "grad_norm": 0.5251675248146057, "learning_rate": 1.2070382589244026e-05, "loss": 0.8919, "step": 1800 }, { "epoch": 2.5226480836236935, "grad_norm": 0.6049728989601135, "learning_rate": 1.199545001510218e-05, "loss": 0.8417, "step": 1810 }, { "epoch": 2.5365853658536586, "grad_norm": 0.5997565984725952, "learning_rate": 1.1920400568508201e-05, "loss": 0.8831, "step": 1820 }, { "epoch": 2.5505226480836236, "grad_norm": 0.5272901058197021, "learning_rate": 1.184523864506849e-05, "loss": 0.8773, "step": 1830 }, { "epoch": 2.564459930313589, "grad_norm": 0.567862331867218, "learning_rate": 1.1769968646977148e-05, "loss": 0.8595, "step": 1840 }, { "epoch": 2.578397212543554, "grad_norm": 0.5373286008834839, "learning_rate": 1.1694594982758164e-05, "loss": 0.8896, "step": 1850 }, { "epoch": 2.592334494773519, "grad_norm": 0.5112028121948242, "learning_rate": 1.161912206700719e-05, "loss": 0.8882, "step": 1860 }, { "epoch": 2.6062717770034842, "grad_norm": 0.4764540493488312, "learning_rate": 1.154355432013299e-05, "loss": 0.8381, "step": 1870 }, { "epoch": 2.6202090592334493, "grad_norm": 0.7286739349365234, "learning_rate": 1.1467896168098533e-05, "loss": 0.8502, "step": 1880 }, { "epoch": 2.6341463414634148, "grad_norm": 0.5751617550849915, "learning_rate": 1.1392152042161774e-05, "loss": 0.8631, "step": 1890 }, { "epoch": 2.64808362369338, "grad_norm": 0.5550952553749084, "learning_rate": 1.1316326378616121e-05, "loss": 0.9055, "step": 1900 }, { "epoch": 2.662020905923345, "grad_norm": 0.5390698909759521, "learning_rate": 1.1240423618530578e-05, "loss": 0.8586, "step": 1910 }, { "epoch": 2.6759581881533103, "grad_norm": 0.5401940941810608, "learning_rate": 1.1164448207489673e-05, "loss": 0.873, "step": 1920 }, { "epoch": 2.6898954703832754, "grad_norm": 0.7127025723457336, "learning_rate": 1.1088404595333046e-05, "loss": 0.8753, "step": 1930 }, { "epoch": 2.7038327526132404, "grad_norm": 0.6411701440811157, "learning_rate": 1.101229723589485e-05, "loss": 0.8814, "step": 1940 }, { "epoch": 2.7177700348432055, "grad_norm": 0.5122844576835632, "learning_rate": 1.0936130586742881e-05, "loss": 0.8509, "step": 1950 }, { "epoch": 2.7317073170731705, "grad_norm": 2.784543514251709, "learning_rate": 1.0859909108917497e-05, "loss": 0.8112, "step": 1960 }, { "epoch": 2.745644599303136, "grad_norm": 0.533532977104187, "learning_rate": 1.0783637266670348e-05, "loss": 0.8479, "step": 1970 }, { "epoch": 2.759581881533101, "grad_norm": 0.5365408062934875, "learning_rate": 1.0707319527202902e-05, "loss": 0.8281, "step": 1980 }, { "epoch": 2.773519163763066, "grad_norm": 0.45295801758766174, "learning_rate": 1.0630960360404793e-05, "loss": 0.9046, "step": 1990 }, { "epoch": 2.7874564459930316, "grad_norm": 0.656039834022522, "learning_rate": 1.0554564238592051e-05, "loss": 0.8305, "step": 2000 }, { "epoch": 2.8013937282229966, "grad_norm": 0.5675934553146362, "learning_rate": 1.0478135636245122e-05, "loss": 0.8633, "step": 2010 }, { "epoch": 2.8153310104529616, "grad_norm": 0.5480667948722839, "learning_rate": 1.0401679029746828e-05, "loss": 0.8756, "step": 2020 }, { "epoch": 2.8292682926829267, "grad_norm": 0.5900964736938477, "learning_rate": 1.0325198897120183e-05, "loss": 0.8737, "step": 2030 }, { "epoch": 2.8432055749128917, "grad_norm": 0.688490092754364, "learning_rate": 1.0248699717766107e-05, "loss": 0.8425, "step": 2040 }, { "epoch": 2.857142857142857, "grad_norm": 0.5785161256790161, "learning_rate": 1.0172185972201082e-05, "loss": 0.902, "step": 2050 }, { "epoch": 2.8710801393728222, "grad_norm": 0.5259153246879578, "learning_rate": 1.0095662141794725e-05, "loss": 0.8793, "step": 2060 }, { "epoch": 2.8850174216027873, "grad_norm": 0.5888857841491699, "learning_rate": 1.0019132708507307e-05, "loss": 0.8665, "step": 2070 }, { "epoch": 2.8989547038327528, "grad_norm": 0.6237362027168274, "learning_rate": 9.94260215462727e-06, "loss": 0.8647, "step": 2080 }, { "epoch": 2.912891986062718, "grad_norm": 0.5640315413475037, "learning_rate": 9.866074962508684e-06, "loss": 0.8659, "step": 2090 }, { "epoch": 2.926829268292683, "grad_norm": 0.4334649443626404, "learning_rate": 9.789555614308721e-06, "loss": 0.8566, "step": 2100 }, { "epoch": 2.940766550522648, "grad_norm": 0.5068169832229614, "learning_rate": 9.713048591725138e-06, "loss": 0.8712, "step": 2110 }, { "epoch": 2.9547038327526134, "grad_norm": 0.5684682726860046, "learning_rate": 9.63655837573379e-06, "loss": 0.8217, "step": 2120 }, { "epoch": 2.9686411149825784, "grad_norm": 0.6925583481788635, "learning_rate": 9.560089446326175e-06, "loss": 0.8675, "step": 2130 }, { "epoch": 2.9825783972125435, "grad_norm": 0.7573685050010681, "learning_rate": 9.483646282247056e-06, "loss": 0.8369, "step": 2140 }, { "epoch": 2.996515679442509, "grad_norm": 0.598778486251831, "learning_rate": 9.407233360732119e-06, "loss": 0.8434, "step": 2150 }, { "epoch": 3.010452961672474, "grad_norm": 114.71971893310547, "learning_rate": 9.330855157245776e-06, "loss": 0.8841, "step": 2160 }, { "epoch": 3.024390243902439, "grad_norm": 0.5424976348876953, "learning_rate": 9.254516145219006e-06, "loss": 0.8653, "step": 2170 }, { "epoch": 3.038327526132404, "grad_norm": 0.48183199763298035, "learning_rate": 9.17822079578738e-06, "loss": 0.8402, "step": 2180 }, { "epoch": 3.052264808362369, "grad_norm": 0.5667704343795776, "learning_rate": 9.101973577529164e-06, "loss": 0.8357, "step": 2190 }, { "epoch": 3.0662020905923346, "grad_norm": 0.5843963027000427, "learning_rate": 9.025778956203611e-06, "loss": 0.8538, "step": 2200 }, { "epoch": 3.0801393728222997, "grad_norm": 0.5097166895866394, "learning_rate": 8.949641394489399e-06, "loss": 0.8208, "step": 2210 }, { "epoch": 3.0940766550522647, "grad_norm": 0.5178412795066833, "learning_rate": 8.873565351723249e-06, "loss": 0.9026, "step": 2220 }, { "epoch": 3.10801393728223, "grad_norm": 0.6717800498008728, "learning_rate": 8.79755528363876e-06, "loss": 0.8002, "step": 2230 }, { "epoch": 3.1219512195121952, "grad_norm": 0.81369549036026, "learning_rate": 8.721615642105417e-06, "loss": 0.8757, "step": 2240 }, { "epoch": 3.1358885017421603, "grad_norm": 0.574155867099762, "learning_rate": 8.645750874867876e-06, "loss": 0.8411, "step": 2250 }, { "epoch": 3.1498257839721253, "grad_norm": 0.4896714985370636, "learning_rate": 8.56996542528542e-06, "loss": 0.8671, "step": 2260 }, { "epoch": 3.1637630662020904, "grad_norm": 0.5032427906990051, "learning_rate": 8.494263732071772e-06, "loss": 0.8521, "step": 2270 }, { "epoch": 3.177700348432056, "grad_norm": 0.5645169615745544, "learning_rate": 8.418650229035054e-06, "loss": 0.8407, "step": 2280 }, { "epoch": 3.191637630662021, "grad_norm": 0.5049313306808472, "learning_rate": 8.343129344818162e-06, "loss": 0.853, "step": 2290 }, { "epoch": 3.205574912891986, "grad_norm": 0.5244989991188049, "learning_rate": 8.267705502639342e-06, "loss": 0.8546, "step": 2300 }, { "epoch": 3.2195121951219514, "grad_norm": 0.6323722004890442, "learning_rate": 8.192383120033147e-06, "loss": 0.8408, "step": 2310 }, { "epoch": 3.2334494773519165, "grad_norm": 0.5894546508789062, "learning_rate": 8.117166608591693e-06, "loss": 0.865, "step": 2320 }, { "epoch": 3.2473867595818815, "grad_norm": 1.284786343574524, "learning_rate": 8.042060373706275e-06, "loss": 0.8596, "step": 2330 }, { "epoch": 3.2613240418118465, "grad_norm": 0.5084718465805054, "learning_rate": 7.967068814309359e-06, "loss": 0.8377, "step": 2340 }, { "epoch": 3.275261324041812, "grad_norm": 0.5845734477043152, "learning_rate": 7.892196322616912e-06, "loss": 0.8597, "step": 2350 }, { "epoch": 3.289198606271777, "grad_norm": 0.5465214252471924, "learning_rate": 7.817447283871187e-06, "loss": 0.8584, "step": 2360 }, { "epoch": 3.303135888501742, "grad_norm": 0.5865809917449951, "learning_rate": 7.742826076083848e-06, "loss": 0.843, "step": 2370 }, { "epoch": 3.317073170731707, "grad_norm": 0.455839604139328, "learning_rate": 7.668337069779577e-06, "loss": 0.8599, "step": 2380 }, { "epoch": 3.3310104529616726, "grad_norm": 0.48517608642578125, "learning_rate": 7.593984627740075e-06, "loss": 0.8592, "step": 2390 }, { "epoch": 3.3449477351916377, "grad_norm": 0.5980703830718994, "learning_rate": 7.519773104748562e-06, "loss": 0.8673, "step": 2400 }, { "epoch": 3.3588850174216027, "grad_norm": 0.5855985879898071, "learning_rate": 7.4457068473346836e-06, "loss": 0.8518, "step": 2410 }, { "epoch": 3.3728222996515678, "grad_norm": 0.4601869583129883, "learning_rate": 7.371790193519979e-06, "loss": 0.8457, "step": 2420 }, { "epoch": 3.3867595818815333, "grad_norm": 0.5327535271644592, "learning_rate": 7.298027472563768e-06, "loss": 0.8373, "step": 2430 }, { "epoch": 3.4006968641114983, "grad_norm": 0.5465139746665955, "learning_rate": 7.224423004709607e-06, "loss": 0.8515, "step": 2440 }, { "epoch": 3.4146341463414633, "grad_norm": 0.5567154884338379, "learning_rate": 7.1509811009322574e-06, "loss": 0.8541, "step": 2450 }, { "epoch": 3.4285714285714284, "grad_norm": 0.5773440599441528, "learning_rate": 7.077706062685181e-06, "loss": 0.849, "step": 2460 }, { "epoch": 3.442508710801394, "grad_norm": 0.5186436176300049, "learning_rate": 7.004602181648626e-06, "loss": 0.8857, "step": 2470 }, { "epoch": 3.456445993031359, "grad_norm": 0.647905707359314, "learning_rate": 6.931673739478235e-06, "loss": 0.8486, "step": 2480 }, { "epoch": 3.470383275261324, "grad_norm": 0.7109155058860779, "learning_rate": 6.858925007554308e-06, "loss": 0.8703, "step": 2490 }, { "epoch": 3.484320557491289, "grad_norm": 0.5457651019096375, "learning_rate": 6.786360246731595e-06, "loss": 0.8494, "step": 2500 }, { "epoch": 3.4982578397212545, "grad_norm": 0.49579504132270813, "learning_rate": 6.713983707089773e-06, "loss": 0.848, "step": 2510 }, { "epoch": 3.5121951219512195, "grad_norm": 0.472918838262558, "learning_rate": 6.641799627684481e-06, "loss": 0.8633, "step": 2520 }, { "epoch": 3.5261324041811846, "grad_norm": 0.6405051946640015, "learning_rate": 6.569812236299089e-06, "loss": 0.8672, "step": 2530 }, { "epoch": 3.54006968641115, "grad_norm": 0.5040938258171082, "learning_rate": 6.498025749197036e-06, "loss": 0.847, "step": 2540 }, { "epoch": 3.554006968641115, "grad_norm": 0.5406576991081238, "learning_rate": 6.426444370874906e-06, "loss": 0.8291, "step": 2550 }, { "epoch": 3.56794425087108, "grad_norm": 0.47771602869033813, "learning_rate": 6.355072293816178e-06, "loss": 0.8522, "step": 2560 }, { "epoch": 3.581881533101045, "grad_norm": 0.5669821500778198, "learning_rate": 6.283913698245659e-06, "loss": 0.8316, "step": 2570 }, { "epoch": 3.59581881533101, "grad_norm": 0.6127913594245911, "learning_rate": 6.212972751884663e-06, "loss": 0.8686, "step": 2580 }, { "epoch": 3.6097560975609757, "grad_norm": 0.5714460015296936, "learning_rate": 6.142253609706898e-06, "loss": 0.8493, "step": 2590 }, { "epoch": 3.6236933797909407, "grad_norm": 0.5862120389938354, "learning_rate": 6.0717604136951315e-06, "loss": 0.8962, "step": 2600 }, { "epoch": 3.637630662020906, "grad_norm": 0.5323344469070435, "learning_rate": 6.001497292598566e-06, "loss": 0.8615, "step": 2610 }, { "epoch": 3.6515679442508713, "grad_norm": 0.5613053441047668, "learning_rate": 5.931468361691053e-06, "loss": 0.8823, "step": 2620 }, { "epoch": 3.6655052264808363, "grad_norm": 0.512625515460968, "learning_rate": 5.861677722530037e-06, "loss": 0.8505, "step": 2630 }, { "epoch": 3.6794425087108014, "grad_norm": 0.5254554152488708, "learning_rate": 5.792129462716355e-06, "loss": 0.8456, "step": 2640 }, { "epoch": 3.6933797909407664, "grad_norm": 0.5453774929046631, "learning_rate": 5.722827655654801e-06, "loss": 0.862, "step": 2650 }, { "epoch": 3.7073170731707314, "grad_norm": 0.5125150680541992, "learning_rate": 5.653776360315562e-06, "loss": 0.8497, "step": 2660 }, { "epoch": 3.721254355400697, "grad_norm": 0.5420352816581726, "learning_rate": 5.584979620996491e-06, "loss": 0.8507, "step": 2670 }, { "epoch": 3.735191637630662, "grad_norm": 0.6987497210502625, "learning_rate": 5.516441467086231e-06, "loss": 0.8596, "step": 2680 }, { "epoch": 3.749128919860627, "grad_norm": 0.6294048428535461, "learning_rate": 5.448165912828214e-06, "loss": 0.8402, "step": 2690 }, { "epoch": 3.7630662020905925, "grad_norm": 0.4914676547050476, "learning_rate": 5.380156957085536e-06, "loss": 0.8544, "step": 2700 }, { "epoch": 3.7770034843205575, "grad_norm": 0.46825504302978516, "learning_rate": 5.312418583106784e-06, "loss": 0.8307, "step": 2710 }, { "epoch": 3.7909407665505226, "grad_norm": 0.6263434290885925, "learning_rate": 5.244954758292691e-06, "loss": 0.8472, "step": 2720 }, { "epoch": 3.8048780487804876, "grad_norm": 0.6025940179824829, "learning_rate": 5.177769433963801e-06, "loss": 0.8388, "step": 2730 }, { "epoch": 3.818815331010453, "grad_norm": 0.6129311919212341, "learning_rate": 5.110866545129031e-06, "loss": 0.8647, "step": 2740 }, { "epoch": 3.832752613240418, "grad_norm": 0.5434401035308838, "learning_rate": 5.044250010255202e-06, "loss": 0.8224, "step": 2750 }, { "epoch": 3.846689895470383, "grad_norm": 0.5361849069595337, "learning_rate": 4.97792373103753e-06, "loss": 0.8677, "step": 2760 }, { "epoch": 3.8606271777003487, "grad_norm": 0.5294119715690613, "learning_rate": 4.911891592171113e-06, "loss": 0.8471, "step": 2770 }, { "epoch": 3.8745644599303137, "grad_norm": 0.3936956524848938, "learning_rate": 4.846157461123411e-06, "loss": 0.8718, "step": 2780 }, { "epoch": 3.8885017421602788, "grad_norm": 0.5664070844650269, "learning_rate": 4.780725187907707e-06, "loss": 0.8424, "step": 2790 }, { "epoch": 3.902439024390244, "grad_norm": 0.6031394600868225, "learning_rate": 4.715598604857648e-06, "loss": 0.8469, "step": 2800 }, { "epoch": 3.916376306620209, "grad_norm": 0.5560880899429321, "learning_rate": 4.65078152640276e-06, "loss": 0.8626, "step": 2810 }, { "epoch": 3.9303135888501743, "grad_norm": 0.5566712021827698, "learning_rate": 4.586277748845055e-06, "loss": 0.9251, "step": 2820 }, { "epoch": 3.9442508710801394, "grad_norm": 0.5782540440559387, "learning_rate": 4.5220910501366635e-06, "loss": 0.8321, "step": 2830 }, { "epoch": 3.9581881533101044, "grad_norm": 0.5216631889343262, "learning_rate": 4.458225189658598e-06, "loss": 0.8293, "step": 2840 }, { "epoch": 3.97212543554007, "grad_norm": 0.511968195438385, "learning_rate": 4.3946839080005236e-06, "loss": 0.8238, "step": 2850 }, { "epoch": 3.986062717770035, "grad_norm": 0.49393701553344727, "learning_rate": 4.331470926741707e-06, "loss": 0.885, "step": 2860 }, { "epoch": 4.0, "grad_norm": 0.4580581486225128, "learning_rate": 4.268589948233034e-06, "loss": 0.8487, "step": 2870 }, { "epoch": 4.013937282229965, "grad_norm": 0.4895112216472626, "learning_rate": 4.2060446553801585e-06, "loss": 0.8222, "step": 2880 }, { "epoch": 4.02787456445993, "grad_norm": 0.5034216046333313, "learning_rate": 4.143838711427808e-06, "loss": 0.8645, "step": 2890 }, { "epoch": 4.041811846689895, "grad_norm": 0.5985310673713684, "learning_rate": 4.0819757597452246e-06, "loss": 0.8652, "step": 2900 }, { "epoch": 4.055749128919861, "grad_norm": 0.4828563332557678, "learning_rate": 4.020459423612777e-06, "loss": 0.8602, "step": 2910 }, { "epoch": 4.069686411149826, "grad_norm": 0.48318225145339966, "learning_rate": 3.959293306009734e-06, "loss": 0.8541, "step": 2920 }, { "epoch": 4.083623693379791, "grad_norm": 0.5043054223060608, "learning_rate": 3.89848098940326e-06, "loss": 0.857, "step": 2930 }, { "epoch": 4.097560975609756, "grad_norm": 0.5322110652923584, "learning_rate": 3.838026035538581e-06, "loss": 0.8419, "step": 2940 }, { "epoch": 4.111498257839721, "grad_norm": 0.7655691504478455, "learning_rate": 3.7779319852303766e-06, "loss": 0.8551, "step": 2950 }, { "epoch": 4.125435540069686, "grad_norm": 0.5095774531364441, "learning_rate": 3.718202358155384e-06, "loss": 0.838, "step": 2960 }, { "epoch": 4.139372822299651, "grad_norm": 0.5320903658866882, "learning_rate": 3.658840652646287e-06, "loss": 0.8044, "step": 2970 }, { "epoch": 4.153310104529616, "grad_norm": 0.5706301927566528, "learning_rate": 3.5998503454867807e-06, "loss": 0.858, "step": 2980 }, { "epoch": 4.167247386759582, "grad_norm": 1.014062523841858, "learning_rate": 3.5412348917079507e-06, "loss": 0.8739, "step": 2990 }, { "epoch": 4.181184668989547, "grad_norm": 0.5381621718406677, "learning_rate": 3.4829977243859414e-06, "loss": 0.8082, "step": 3000 }, { "epoch": 4.195121951219512, "grad_norm": 0.48279663920402527, "learning_rate": 3.425142254440835e-06, "loss": 0.8335, "step": 3010 }, { "epoch": 4.209059233449477, "grad_norm": 0.6682676076889038, "learning_rate": 3.367671870436915e-06, "loss": 0.8484, "step": 3020 }, { "epoch": 4.2229965156794425, "grad_norm": 0.5164760947227478, "learning_rate": 3.310589938384179e-06, "loss": 0.8228, "step": 3030 }, { "epoch": 4.2369337979094075, "grad_norm": 0.5330147743225098, "learning_rate": 3.253899801541206e-06, "loss": 0.8475, "step": 3040 }, { "epoch": 4.2508710801393725, "grad_norm": 0.5183762907981873, "learning_rate": 3.197604780219323e-06, "loss": 0.8228, "step": 3050 }, { "epoch": 4.264808362369338, "grad_norm": 0.5862233638763428, "learning_rate": 3.1417081715881623e-06, "loss": 0.8419, "step": 3060 }, { "epoch": 4.2787456445993035, "grad_norm": 0.5445356369018555, "learning_rate": 3.0862132494825325e-06, "loss": 0.875, "step": 3070 }, { "epoch": 4.2926829268292686, "grad_norm": 0.5905585885047913, "learning_rate": 3.0311232642106768e-06, "loss": 0.8548, "step": 3080 }, { "epoch": 4.306620209059234, "grad_norm": 0.5876056551933289, "learning_rate": 2.976441442363893e-06, "loss": 0.8812, "step": 3090 }, { "epoch": 4.320557491289199, "grad_norm": 0.5916198492050171, "learning_rate": 2.922170986627573e-06, "loss": 0.8289, "step": 3100 }, { "epoch": 4.334494773519164, "grad_norm": 0.555949866771698, "learning_rate": 2.8683150755936107e-06, "loss": 0.8822, "step": 3110 }, { "epoch": 4.348432055749129, "grad_norm": 0.5823965668678284, "learning_rate": 2.8148768635742286e-06, "loss": 0.8308, "step": 3120 }, { "epoch": 4.362369337979094, "grad_norm": 0.6472144722938538, "learning_rate": 2.761859480417255e-06, "loss": 0.8368, "step": 3130 }, { "epoch": 4.376306620209059, "grad_norm": 0.5416210293769836, "learning_rate": 2.7092660313227748e-06, "loss": 0.8655, "step": 3140 }, { "epoch": 4.390243902439025, "grad_norm": 0.5069013833999634, "learning_rate": 2.6570995966612945e-06, "loss": 0.8657, "step": 3150 }, { "epoch": 4.40418118466899, "grad_norm": 0.4444841146469116, "learning_rate": 2.605363231793302e-06, "loss": 0.8362, "step": 3160 }, { "epoch": 4.418118466898955, "grad_norm": 0.5552143454551697, "learning_rate": 2.554059966890332e-06, "loss": 0.8027, "step": 3170 }, { "epoch": 4.43205574912892, "grad_norm": 0.7829400897026062, "learning_rate": 2.503192806757474e-06, "loss": 0.8351, "step": 3180 }, { "epoch": 4.445993031358885, "grad_norm": 0.477295458316803, "learning_rate": 2.4527647306574e-06, "loss": 0.8102, "step": 3190 }, { "epoch": 4.45993031358885, "grad_norm": 0.5426877737045288, "learning_rate": 2.402778692135861e-06, "loss": 0.8406, "step": 3200 }, { "epoch": 4.473867595818815, "grad_norm": 0.5670856237411499, "learning_rate": 2.353237618848695e-06, "loss": 0.8258, "step": 3210 }, { "epoch": 4.487804878048781, "grad_norm": 0.4924805462360382, "learning_rate": 2.304144412390367e-06, "loss": 0.8303, "step": 3220 }, { "epoch": 4.501742160278746, "grad_norm": 0.5142589807510376, "learning_rate": 2.255501948124017e-06, "loss": 0.8714, "step": 3230 }, { "epoch": 4.515679442508711, "grad_norm": 0.546807587146759, "learning_rate": 2.207313075013059e-06, "loss": 0.8221, "step": 3240 }, { "epoch": 4.529616724738676, "grad_norm": 0.5798578858375549, "learning_rate": 2.1595806154542965e-06, "loss": 0.8625, "step": 3250 }, { "epoch": 4.543554006968641, "grad_norm": 0.567733108997345, "learning_rate": 2.112307365112657e-06, "loss": 0.8242, "step": 3260 }, { "epoch": 4.557491289198606, "grad_norm": 0.5294705033302307, "learning_rate": 2.065496092757403e-06, "loss": 0.8721, "step": 3270 }, { "epoch": 4.571428571428571, "grad_norm": 0.49266737699508667, "learning_rate": 2.019149540100005e-06, "loss": 0.8576, "step": 3280 }, { "epoch": 4.585365853658536, "grad_norm": 0.5400304198265076, "learning_rate": 1.973270421633543e-06, "loss": 0.8899, "step": 3290 }, { "epoch": 4.599303135888501, "grad_norm": 0.5392309427261353, "learning_rate": 1.927861424473726e-06, "loss": 0.8227, "step": 3300 }, { "epoch": 4.613240418118467, "grad_norm": 0.5385839343070984, "learning_rate": 1.882925208201498e-06, "loss": 0.838, "step": 3310 }, { "epoch": 4.627177700348432, "grad_norm": 0.5418098568916321, "learning_rate": 1.8384644047072864e-06, "loss": 0.8612, "step": 3320 }, { "epoch": 4.641114982578397, "grad_norm": 0.5207215547561646, "learning_rate": 1.7944816180368408e-06, "loss": 0.8356, "step": 3330 }, { "epoch": 4.655052264808362, "grad_norm": 0.9076542854309082, "learning_rate": 1.7509794242387135e-06, "loss": 0.8224, "step": 3340 }, { "epoch": 4.668989547038327, "grad_norm": 0.4973151385784149, "learning_rate": 1.7079603712133908e-06, "loss": 0.8678, "step": 3350 }, { "epoch": 4.682926829268292, "grad_norm": 0.5431068539619446, "learning_rate": 1.6654269785640608e-06, "loss": 0.8467, "step": 3360 }, { "epoch": 4.696864111498257, "grad_norm": 0.4680764079093933, "learning_rate": 1.623381737449038e-06, "loss": 0.8364, "step": 3370 }, { "epoch": 4.710801393728223, "grad_norm": 0.507692813873291, "learning_rate": 1.5818271104358574e-06, "loss": 0.8854, "step": 3380 }, { "epoch": 4.724738675958188, "grad_norm": 0.758124828338623, "learning_rate": 1.5407655313570525e-06, "loss": 0.8534, "step": 3390 }, { "epoch": 4.7386759581881535, "grad_norm": 0.6623209118843079, "learning_rate": 1.5001994051675894e-06, "loss": 0.8814, "step": 3400 }, { "epoch": 4.7526132404181185, "grad_norm": 0.5073679089546204, "learning_rate": 1.4601311078040304e-06, "loss": 0.8457, "step": 3410 }, { "epoch": 4.7665505226480835, "grad_norm": 0.500339150428772, "learning_rate": 1.4205629860453641e-06, "loss": 0.842, "step": 3420 }, { "epoch": 4.780487804878049, "grad_norm": 0.8517163395881653, "learning_rate": 1.3814973573755518e-06, "loss": 0.8982, "step": 3430 }, { "epoch": 4.794425087108014, "grad_norm": 0.5409610867500305, "learning_rate": 1.3429365098478087e-06, "loss": 0.8492, "step": 3440 }, { "epoch": 4.80836236933798, "grad_norm": 0.5923708081245422, "learning_rate": 1.3048827019505828e-06, "loss": 0.8548, "step": 3450 }, { "epoch": 4.822299651567945, "grad_norm": 0.4563385844230652, "learning_rate": 1.2673381624752813e-06, "loss": 0.8518, "step": 3460 }, { "epoch": 4.83623693379791, "grad_norm": 0.89119553565979, "learning_rate": 1.2303050903857195e-06, "loss": 0.8355, "step": 3470 }, { "epoch": 4.850174216027875, "grad_norm": 0.5999810695648193, "learning_rate": 1.1937856546893533e-06, "loss": 0.8347, "step": 3480 }, { "epoch": 4.86411149825784, "grad_norm": 0.6495084166526794, "learning_rate": 1.1577819943102132e-06, "loss": 0.7981, "step": 3490 }, { "epoch": 4.878048780487805, "grad_norm": 0.5115063190460205, "learning_rate": 1.122296217963651e-06, "loss": 0.8321, "step": 3500 }, { "epoch": 4.89198606271777, "grad_norm": 0.5369818806648254, "learning_rate": 1.0873304040328193e-06, "loss": 0.8499, "step": 3510 }, { "epoch": 4.905923344947735, "grad_norm": 0.523055374622345, "learning_rate": 1.052886600446954e-06, "loss": 0.8561, "step": 3520 }, { "epoch": 4.9198606271777, "grad_norm": 0.6084771156311035, "learning_rate": 1.0189668245614092e-06, "loss": 0.854, "step": 3530 }, { "epoch": 4.933797909407666, "grad_norm": 0.4877120852470398, "learning_rate": 9.855730630395244e-07, "loss": 0.8161, "step": 3540 }, { "epoch": 4.947735191637631, "grad_norm": 0.6254660487174988, "learning_rate": 9.52707271736254e-07, "loss": 0.8705, "step": 3550 }, { "epoch": 4.961672473867596, "grad_norm": 0.6138275861740112, "learning_rate": 9.203713755836108e-07, "loss": 0.8599, "step": 3560 }, { "epoch": 4.975609756097561, "grad_norm": 0.5839621424674988, "learning_rate": 8.885672684779345e-07, "loss": 0.8536, "step": 3570 }, { "epoch": 4.989547038327526, "grad_norm": 0.544059157371521, "learning_rate": 8.572968131689585e-07, "loss": 0.8536, "step": 3580 }, { "epoch": 5.003484320557491, "grad_norm": 0.5101696848869324, "learning_rate": 8.265618411507148e-07, "loss": 0.84, "step": 3590 }, { "epoch": 5.017421602787456, "grad_norm": 0.5106877684593201, "learning_rate": 7.963641525542564e-07, "loss": 0.8752, "step": 3600 }, { "epoch": 5.031358885017422, "grad_norm": 0.5784628391265869, "learning_rate": 7.667055160422432e-07, "loss": 0.8417, "step": 3610 }, { "epoch": 5.045296167247387, "grad_norm": 0.5501140356063843, "learning_rate": 7.375876687053252e-07, "loss": 0.8473, "step": 3620 }, { "epoch": 5.059233449477352, "grad_norm": 0.4966902434825897, "learning_rate": 7.090123159604234e-07, "loss": 0.8414, "step": 3630 }, { "epoch": 5.073170731707317, "grad_norm": 0.769673764705658, "learning_rate": 6.809811314508386e-07, "loss": 0.8604, "step": 3640 }, { "epoch": 5.087108013937282, "grad_norm": 0.55353182554245, "learning_rate": 6.534957569482214e-07, "loss": 0.8601, "step": 3650 }, { "epoch": 5.101045296167247, "grad_norm": 0.6018425822257996, "learning_rate": 6.265578022564233e-07, "loss": 0.8661, "step": 3660 }, { "epoch": 5.114982578397212, "grad_norm": 0.534132182598114, "learning_rate": 6.001688451172027e-07, "loss": 0.8218, "step": 3670 }, { "epoch": 5.128919860627177, "grad_norm": 0.5793606042861938, "learning_rate": 5.743304311178289e-07, "loss": 0.8399, "step": 3680 }, { "epoch": 5.142857142857143, "grad_norm": 0.6463719010353088, "learning_rate": 5.490440736005397e-07, "loss": 0.8249, "step": 3690 }, { "epoch": 5.156794425087108, "grad_norm": 0.5509739518165588, "learning_rate": 5.24311253573927e-07, "loss": 0.8125, "step": 3700 }, { "epoch": 5.170731707317073, "grad_norm": 0.4920913279056549, "learning_rate": 5.001334196261776e-07, "loss": 0.8701, "step": 3710 }, { "epoch": 5.184668989547038, "grad_norm": 0.5054136514663696, "learning_rate": 4.765119878402424e-07, "loss": 0.8548, "step": 3720 }, { "epoch": 5.198606271777003, "grad_norm": 0.5016888380050659, "learning_rate": 4.5344834171088594e-07, "loss": 0.837, "step": 3730 }, { "epoch": 5.2125435540069684, "grad_norm": 0.5839513540267944, "learning_rate": 4.309438320636705e-07, "loss": 0.8781, "step": 3740 }, { "epoch": 5.2264808362369335, "grad_norm": 0.5476316213607788, "learning_rate": 4.089997769758225e-07, "loss": 0.8616, "step": 3750 }, { "epoch": 5.2404181184668985, "grad_norm": 0.562765896320343, "learning_rate": 3.876174616990402e-07, "loss": 0.8614, "step": 3760 }, { "epoch": 5.2543554006968645, "grad_norm": 0.5283138751983643, "learning_rate": 3.6679813858422673e-07, "loss": 0.8013, "step": 3770 }, { "epoch": 5.2682926829268295, "grad_norm": 0.4689981937408447, "learning_rate": 3.46543027008126e-07, "loss": 0.7839, "step": 3780 }, { "epoch": 5.2822299651567945, "grad_norm": 0.46892431378364563, "learning_rate": 3.2685331330190916e-07, "loss": 0.8268, "step": 3790 }, { "epoch": 5.29616724738676, "grad_norm": 0.5027751326560974, "learning_rate": 3.0773015068169876e-07, "loss": 0.8209, "step": 3800 }, { "epoch": 5.310104529616725, "grad_norm": 0.6267147660255432, "learning_rate": 2.891746591810152e-07, "loss": 0.8121, "step": 3810 }, { "epoch": 5.32404181184669, "grad_norm": 0.5063154697418213, "learning_rate": 2.7118792558518237e-07, "loss": 0.8622, "step": 3820 }, { "epoch": 5.337979094076655, "grad_norm": 0.5375659465789795, "learning_rate": 2.5377100336767547e-07, "loss": 0.8033, "step": 3830 }, { "epoch": 5.351916376306621, "grad_norm": 0.5449223518371582, "learning_rate": 2.3692491262841788e-07, "loss": 0.848, "step": 3840 }, { "epoch": 5.365853658536586, "grad_norm": 0.5025774240493774, "learning_rate": 2.206506400340369e-07, "loss": 0.8403, "step": 3850 }, { "epoch": 5.379790940766551, "grad_norm": 0.4367560148239136, "learning_rate": 2.0494913876007105e-07, "loss": 0.8539, "step": 3860 }, { "epoch": 5.393728222996516, "grad_norm": 0.5829946994781494, "learning_rate": 1.8982132843514577e-07, "loss": 0.8371, "step": 3870 }, { "epoch": 5.407665505226481, "grad_norm": 0.5280970931053162, "learning_rate": 1.752680950871144e-07, "loss": 0.8705, "step": 3880 }, { "epoch": 5.421602787456446, "grad_norm": 0.5541146993637085, "learning_rate": 1.6129029109115401e-07, "loss": 0.8514, "step": 3890 }, { "epoch": 5.435540069686411, "grad_norm": 0.5944886207580566, "learning_rate": 1.4788873511985656e-07, "loss": 0.8545, "step": 3900 }, { "epoch": 5.449477351916376, "grad_norm": 0.5219863653182983, "learning_rate": 1.350642120952661e-07, "loss": 0.9, "step": 3910 }, { "epoch": 5.463414634146342, "grad_norm": 0.5505541563034058, "learning_rate": 1.2281747314291437e-07, "loss": 0.8239, "step": 3920 }, { "epoch": 5.477351916376307, "grad_norm": 0.551377534866333, "learning_rate": 1.1114923554782608e-07, "loss": 0.8817, "step": 3930 }, { "epoch": 5.491289198606272, "grad_norm": 0.5536546111106873, "learning_rate": 1.0006018271250695e-07, "loss": 0.8719, "step": 3940 }, { "epoch": 5.505226480836237, "grad_norm": 0.48071563243865967, "learning_rate": 8.955096411691566e-08, "loss": 0.8517, "step": 3950 }, { "epoch": 5.519163763066202, "grad_norm": 0.5239782929420471, "learning_rate": 7.962219528042991e-08, "loss": 0.8284, "step": 3960 }, { "epoch": 5.533101045296167, "grad_norm": 0.5262070298194885, "learning_rate": 7.027445772578856e-08, "loss": 0.8277, "step": 3970 }, { "epoch": 5.547038327526132, "grad_norm": 0.5303363800048828, "learning_rate": 6.150829894503662e-08, "loss": 0.8648, "step": 3980 }, { "epoch": 5.560975609756097, "grad_norm": 0.5235099196434021, "learning_rate": 5.332423236745765e-08, "loss": 0.8722, "step": 3990 }, { "epoch": 5.574912891986063, "grad_norm": 0.5281161665916443, "learning_rate": 4.5722737329505495e-08, "loss": 0.8452, "step": 4000 }, { "epoch": 5.588850174216028, "grad_norm": 0.6809967756271362, "learning_rate": 3.870425904672237e-08, "loss": 0.8571, "step": 4010 }, { "epoch": 5.602787456445993, "grad_norm": 0.5919767618179321, "learning_rate": 3.22692085876708e-08, "loss": 0.8392, "step": 4020 }, { "epoch": 5.616724738675958, "grad_norm": 0.5929062962532043, "learning_rate": 2.6417962849852875e-08, "loss": 0.7991, "step": 4030 }, { "epoch": 5.630662020905923, "grad_norm": 85.85006713867188, "learning_rate": 2.1150864537636817e-08, "loss": 0.8357, "step": 4040 }, { "epoch": 5.644599303135888, "grad_norm": 0.5508357286453247, "learning_rate": 1.646822214218524e-08, "loss": 0.8502, "step": 4050 }, { "epoch": 5.658536585365853, "grad_norm": 0.5149642825126648, "learning_rate": 1.2370309923388501e-08, "loss": 0.8546, "step": 4060 }, { "epoch": 5.672473867595819, "grad_norm": 0.601134181022644, "learning_rate": 8.857367893796431e-09, "loss": 0.8809, "step": 4070 }, { "epoch": 5.686411149825784, "grad_norm": 0.6303636431694031, "learning_rate": 5.929601804566254e-09, "loss": 0.8678, "step": 4080 }, { "epoch": 5.700348432055749, "grad_norm": 0.5462765097618103, "learning_rate": 3.5871831334099992e-09, "loss": 0.843, "step": 4090 }, { "epoch": 5.714285714285714, "grad_norm": 0.6144183278083801, "learning_rate": 1.8302490745503166e-09, "loss": 0.8146, "step": 4100 }, { "epoch": 5.7282229965156795, "grad_norm": 0.5314708352088928, "learning_rate": 6.589025306869002e-10, "loss": 0.8237, "step": 4110 }, { "epoch": 5.7421602787456445, "grad_norm": 0.56773841381073, "learning_rate": 7.321210696464853e-11, "loss": 0.8358, "step": 4120 } ], "logging_steps": 10, "max_steps": 4125, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.88632668766208e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }