{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1905, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015748031496062992, "grad_norm": 53.176956950673706, "learning_rate": 2.617801047120419e-07, "loss": 11.2422, "step": 1 }, { "epoch": 0.0031496062992125984, "grad_norm": 53.89222994841817, "learning_rate": 5.235602094240838e-07, "loss": 11.2087, "step": 2 }, { "epoch": 0.004724409448818898, "grad_norm": 54.59497639466375, "learning_rate": 7.853403141361256e-07, "loss": 11.0427, "step": 3 }, { "epoch": 0.006299212598425197, "grad_norm": 52.921891852890354, "learning_rate": 1.0471204188481676e-06, "loss": 11.1706, "step": 4 }, { "epoch": 0.007874015748031496, "grad_norm": 52.55813322058068, "learning_rate": 1.3089005235602096e-06, "loss": 10.9252, "step": 5 }, { "epoch": 0.009448818897637795, "grad_norm": 54.57677484606936, "learning_rate": 1.5706806282722513e-06, "loss": 11.1451, "step": 6 }, { "epoch": 0.011023622047244094, "grad_norm": 59.198622760604174, "learning_rate": 1.832460732984293e-06, "loss": 10.8864, "step": 7 }, { "epoch": 0.012598425196850394, "grad_norm": 58.63171513992076, "learning_rate": 2.094240837696335e-06, "loss": 10.769, "step": 8 }, { "epoch": 0.014173228346456693, "grad_norm": 60.329265320989414, "learning_rate": 2.356020942408377e-06, "loss": 10.793, "step": 9 }, { "epoch": 0.015748031496062992, "grad_norm": 83.42720956951872, "learning_rate": 2.617801047120419e-06, "loss": 9.5334, "step": 10 }, { "epoch": 0.01732283464566929, "grad_norm": 89.90734135114184, "learning_rate": 2.879581151832461e-06, "loss": 9.2727, "step": 11 }, { "epoch": 0.01889763779527559, "grad_norm": 96.16733643027241, "learning_rate": 3.1413612565445026e-06, "loss": 9.0491, "step": 12 }, { "epoch": 0.02047244094488189, "grad_norm": 68.17670276588014, "learning_rate": 3.4031413612565448e-06, "loss": 3.7951, "step": 13 }, { "epoch": 0.02204724409448819, "grad_norm": 60.98565312163723, "learning_rate": 3.664921465968586e-06, "loss": 3.5871, "step": 14 }, { "epoch": 0.023622047244094488, "grad_norm": 43.12264348180443, "learning_rate": 3.926701570680629e-06, "loss": 2.8243, "step": 15 }, { "epoch": 0.025196850393700787, "grad_norm": 36.73246874688359, "learning_rate": 4.18848167539267e-06, "loss": 2.5769, "step": 16 }, { "epoch": 0.026771653543307086, "grad_norm": 8.249474626813296, "learning_rate": 4.450261780104712e-06, "loss": 1.4873, "step": 17 }, { "epoch": 0.028346456692913385, "grad_norm": 5.119009867871627, "learning_rate": 4.712041884816754e-06, "loss": 1.2998, "step": 18 }, { "epoch": 0.029921259842519685, "grad_norm": 4.245542803677077, "learning_rate": 4.973821989528796e-06, "loss": 1.2108, "step": 19 }, { "epoch": 0.031496062992125984, "grad_norm": 3.3966854183297013, "learning_rate": 5.235602094240838e-06, "loss": 1.1932, "step": 20 }, { "epoch": 0.03307086614173228, "grad_norm": 2.6300577479492477, "learning_rate": 5.49738219895288e-06, "loss": 1.1517, "step": 21 }, { "epoch": 0.03464566929133858, "grad_norm": 2.1461806993490895, "learning_rate": 5.759162303664922e-06, "loss": 1.0995, "step": 22 }, { "epoch": 0.03622047244094488, "grad_norm": 1.5338160794899627, "learning_rate": 6.0209424083769635e-06, "loss": 0.9951, "step": 23 }, { "epoch": 0.03779527559055118, "grad_norm": 58.98896148511466, "learning_rate": 6.282722513089005e-06, "loss": 0.9924, "step": 24 }, { "epoch": 0.03937007874015748, "grad_norm": 31.57342949486313, "learning_rate": 6.544502617801048e-06, "loss": 0.9421, "step": 25 }, { "epoch": 0.04094488188976378, "grad_norm": 2.397065575081416, "learning_rate": 6.8062827225130895e-06, "loss": 0.8934, "step": 26 }, { "epoch": 0.04251968503937008, "grad_norm": 1.2206267852738608, "learning_rate": 7.068062827225132e-06, "loss": 0.8799, "step": 27 }, { "epoch": 0.04409448818897638, "grad_norm": 1.0703330703938259, "learning_rate": 7.329842931937172e-06, "loss": 0.8559, "step": 28 }, { "epoch": 0.04566929133858268, "grad_norm": 0.9699588888011453, "learning_rate": 7.591623036649215e-06, "loss": 0.8701, "step": 29 }, { "epoch": 0.047244094488188976, "grad_norm": 0.8025820817331618, "learning_rate": 7.853403141361257e-06, "loss": 0.7831, "step": 30 }, { "epoch": 0.048818897637795275, "grad_norm": 0.7560726281903514, "learning_rate": 8.115183246073298e-06, "loss": 0.782, "step": 31 }, { "epoch": 0.050393700787401574, "grad_norm": 0.9346301256819481, "learning_rate": 8.37696335078534e-06, "loss": 0.7771, "step": 32 }, { "epoch": 0.05196850393700787, "grad_norm": 0.8626580736392985, "learning_rate": 8.638743455497382e-06, "loss": 0.7721, "step": 33 }, { "epoch": 0.05354330708661417, "grad_norm": 0.543692907726409, "learning_rate": 8.900523560209424e-06, "loss": 0.7225, "step": 34 }, { "epoch": 0.05511811023622047, "grad_norm": 0.6197689896928392, "learning_rate": 9.162303664921467e-06, "loss": 0.7345, "step": 35 }, { "epoch": 0.05669291338582677, "grad_norm": 0.7221335582845992, "learning_rate": 9.424083769633508e-06, "loss": 0.7291, "step": 36 }, { "epoch": 0.05826771653543307, "grad_norm": 0.5907163031717492, "learning_rate": 9.68586387434555e-06, "loss": 0.6967, "step": 37 }, { "epoch": 0.05984251968503937, "grad_norm": 0.5073888906371201, "learning_rate": 9.947643979057591e-06, "loss": 0.6742, "step": 38 }, { "epoch": 0.06141732283464567, "grad_norm": 0.4742995157826702, "learning_rate": 1.0209424083769634e-05, "loss": 0.6993, "step": 39 }, { "epoch": 0.06299212598425197, "grad_norm": 0.5254229766121105, "learning_rate": 1.0471204188481676e-05, "loss": 0.6836, "step": 40 }, { "epoch": 0.06456692913385827, "grad_norm": 0.5701697911776024, "learning_rate": 1.0732984293193717e-05, "loss": 0.6538, "step": 41 }, { "epoch": 0.06614173228346457, "grad_norm": 0.5655595000359332, "learning_rate": 1.099476439790576e-05, "loss": 0.6827, "step": 42 }, { "epoch": 0.06771653543307087, "grad_norm": 0.46079546638837277, "learning_rate": 1.12565445026178e-05, "loss": 0.6692, "step": 43 }, { "epoch": 0.06929133858267716, "grad_norm": 0.3892739513553596, "learning_rate": 1.1518324607329843e-05, "loss": 0.6661, "step": 44 }, { "epoch": 0.07086614173228346, "grad_norm": 0.5210248117683633, "learning_rate": 1.1780104712041886e-05, "loss": 0.6825, "step": 45 }, { "epoch": 0.07244094488188976, "grad_norm": 0.473670074138589, "learning_rate": 1.2041884816753927e-05, "loss": 0.6313, "step": 46 }, { "epoch": 0.07401574803149606, "grad_norm": 0.48197346396386237, "learning_rate": 1.230366492146597e-05, "loss": 0.6436, "step": 47 }, { "epoch": 0.07559055118110236, "grad_norm": 0.35609101123809805, "learning_rate": 1.256544502617801e-05, "loss": 0.6248, "step": 48 }, { "epoch": 0.07716535433070866, "grad_norm": 0.363765531225608, "learning_rate": 1.2827225130890053e-05, "loss": 0.636, "step": 49 }, { "epoch": 0.07874015748031496, "grad_norm": 0.40689394220556785, "learning_rate": 1.3089005235602096e-05, "loss": 0.6269, "step": 50 }, { "epoch": 0.08031496062992126, "grad_norm": 0.4195688876631913, "learning_rate": 1.3350785340314136e-05, "loss": 0.6207, "step": 51 }, { "epoch": 0.08188976377952756, "grad_norm": 0.31798671853309196, "learning_rate": 1.3612565445026179e-05, "loss": 0.5821, "step": 52 }, { "epoch": 0.08346456692913386, "grad_norm": 0.315449080188357, "learning_rate": 1.3874345549738222e-05, "loss": 0.6145, "step": 53 }, { "epoch": 0.08503937007874016, "grad_norm": 0.3744023165649201, "learning_rate": 1.4136125654450264e-05, "loss": 0.6208, "step": 54 }, { "epoch": 0.08661417322834646, "grad_norm": 0.33199793463760763, "learning_rate": 1.4397905759162305e-05, "loss": 0.6147, "step": 55 }, { "epoch": 0.08818897637795275, "grad_norm": 0.29470332242700437, "learning_rate": 1.4659685863874344e-05, "loss": 0.6017, "step": 56 }, { "epoch": 0.08976377952755905, "grad_norm": 0.3162334410992618, "learning_rate": 1.4921465968586387e-05, "loss": 0.5808, "step": 57 }, { "epoch": 0.09133858267716535, "grad_norm": 0.37196534453662017, "learning_rate": 1.518324607329843e-05, "loss": 0.609, "step": 58 }, { "epoch": 0.09291338582677165, "grad_norm": 0.31718666768848436, "learning_rate": 1.5445026178010472e-05, "loss": 0.5742, "step": 59 }, { "epoch": 0.09448818897637795, "grad_norm": 0.29578540671028547, "learning_rate": 1.5706806282722515e-05, "loss": 0.5784, "step": 60 }, { "epoch": 0.09606299212598425, "grad_norm": 0.2971911693070281, "learning_rate": 1.5968586387434557e-05, "loss": 0.5622, "step": 61 }, { "epoch": 0.09763779527559055, "grad_norm": 0.2925052381574462, "learning_rate": 1.6230366492146596e-05, "loss": 0.5849, "step": 62 }, { "epoch": 0.09921259842519685, "grad_norm": 0.29815867636100946, "learning_rate": 1.649214659685864e-05, "loss": 0.5741, "step": 63 }, { "epoch": 0.10078740157480315, "grad_norm": 0.2634136551427692, "learning_rate": 1.675392670157068e-05, "loss": 0.5671, "step": 64 }, { "epoch": 0.10236220472440945, "grad_norm": 0.2742673409789681, "learning_rate": 1.7015706806282724e-05, "loss": 0.5718, "step": 65 }, { "epoch": 0.10393700787401575, "grad_norm": 0.2987205638696717, "learning_rate": 1.7277486910994763e-05, "loss": 0.5612, "step": 66 }, { "epoch": 0.10551181102362205, "grad_norm": 0.31237039392235566, "learning_rate": 1.7539267015706806e-05, "loss": 0.5728, "step": 67 }, { "epoch": 0.10708661417322834, "grad_norm": 0.2839775778299326, "learning_rate": 1.780104712041885e-05, "loss": 0.5685, "step": 68 }, { "epoch": 0.10866141732283464, "grad_norm": 0.2715506566902656, "learning_rate": 1.806282722513089e-05, "loss": 0.5662, "step": 69 }, { "epoch": 0.11023622047244094, "grad_norm": 0.2769107411394044, "learning_rate": 1.8324607329842934e-05, "loss": 0.5612, "step": 70 }, { "epoch": 0.11181102362204724, "grad_norm": 0.2704588628786121, "learning_rate": 1.8586387434554976e-05, "loss": 0.5585, "step": 71 }, { "epoch": 0.11338582677165354, "grad_norm": 0.2739876779438809, "learning_rate": 1.8848167539267016e-05, "loss": 0.5749, "step": 72 }, { "epoch": 0.11496062992125984, "grad_norm": 0.3014215584381662, "learning_rate": 1.9109947643979058e-05, "loss": 0.5631, "step": 73 }, { "epoch": 0.11653543307086614, "grad_norm": 0.2882626903282301, "learning_rate": 1.93717277486911e-05, "loss": 0.5837, "step": 74 }, { "epoch": 0.11811023622047244, "grad_norm": 0.313239549213018, "learning_rate": 1.9633507853403143e-05, "loss": 0.5824, "step": 75 }, { "epoch": 0.11968503937007874, "grad_norm": 0.26564326625571916, "learning_rate": 1.9895287958115183e-05, "loss": 0.5444, "step": 76 }, { "epoch": 0.12125984251968504, "grad_norm": 0.2793381652274757, "learning_rate": 2.0157068062827225e-05, "loss": 0.5312, "step": 77 }, { "epoch": 0.12283464566929134, "grad_norm": 0.26638922903704626, "learning_rate": 2.0418848167539268e-05, "loss": 0.5119, "step": 78 }, { "epoch": 0.12440944881889764, "grad_norm": 0.30140453593014654, "learning_rate": 2.068062827225131e-05, "loss": 0.5491, "step": 79 }, { "epoch": 0.12598425196850394, "grad_norm": 0.28618663531030547, "learning_rate": 2.0942408376963353e-05, "loss": 0.5355, "step": 80 }, { "epoch": 0.12755905511811025, "grad_norm": 0.2798358831535125, "learning_rate": 2.1204188481675396e-05, "loss": 0.5469, "step": 81 }, { "epoch": 0.12913385826771653, "grad_norm": 0.30271354974945536, "learning_rate": 2.1465968586387435e-05, "loss": 0.5388, "step": 82 }, { "epoch": 0.13070866141732285, "grad_norm": 0.3236935213111507, "learning_rate": 2.1727748691099477e-05, "loss": 0.5278, "step": 83 }, { "epoch": 0.13228346456692913, "grad_norm": 0.3505410779584772, "learning_rate": 2.198952879581152e-05, "loss": 0.5385, "step": 84 }, { "epoch": 0.13385826771653545, "grad_norm": 0.3354850513385721, "learning_rate": 2.2251308900523562e-05, "loss": 0.5353, "step": 85 }, { "epoch": 0.13543307086614173, "grad_norm": 0.31199810391909794, "learning_rate": 2.25130890052356e-05, "loss": 0.5508, "step": 86 }, { "epoch": 0.13700787401574804, "grad_norm": 0.32764291599912737, "learning_rate": 2.2774869109947644e-05, "loss": 0.532, "step": 87 }, { "epoch": 0.13858267716535433, "grad_norm": 0.34335622412123845, "learning_rate": 2.3036649214659687e-05, "loss": 0.5314, "step": 88 }, { "epoch": 0.14015748031496064, "grad_norm": 0.30341557844620426, "learning_rate": 2.329842931937173e-05, "loss": 0.5597, "step": 89 }, { "epoch": 0.14173228346456693, "grad_norm": 0.3616290621271917, "learning_rate": 2.3560209424083772e-05, "loss": 0.5144, "step": 90 }, { "epoch": 0.14330708661417324, "grad_norm": 0.34567633958067157, "learning_rate": 2.382198952879581e-05, "loss": 0.5357, "step": 91 }, { "epoch": 0.14488188976377953, "grad_norm": 0.5438690267485856, "learning_rate": 2.4083769633507854e-05, "loss": 0.5056, "step": 92 }, { "epoch": 0.14645669291338584, "grad_norm": 0.3170016605864192, "learning_rate": 2.4345549738219896e-05, "loss": 0.52, "step": 93 }, { "epoch": 0.14803149606299212, "grad_norm": 0.3993596384365677, "learning_rate": 2.460732984293194e-05, "loss": 0.5292, "step": 94 }, { "epoch": 0.14960629921259844, "grad_norm": 0.3146306507337827, "learning_rate": 2.486910994764398e-05, "loss": 0.5242, "step": 95 }, { "epoch": 0.15118110236220472, "grad_norm": 0.35274994698597095, "learning_rate": 2.513089005235602e-05, "loss": 0.5002, "step": 96 }, { "epoch": 0.15275590551181104, "grad_norm": 0.3164991031226573, "learning_rate": 2.5392670157068067e-05, "loss": 0.5289, "step": 97 }, { "epoch": 0.15433070866141732, "grad_norm": 0.30985698261473926, "learning_rate": 2.5654450261780106e-05, "loss": 0.5262, "step": 98 }, { "epoch": 0.15590551181102363, "grad_norm": 0.34499150113204824, "learning_rate": 2.591623036649215e-05, "loss": 0.5167, "step": 99 }, { "epoch": 0.15748031496062992, "grad_norm": 0.30166810492362905, "learning_rate": 2.617801047120419e-05, "loss": 0.4963, "step": 100 }, { "epoch": 0.15905511811023623, "grad_norm": 0.33675170392703563, "learning_rate": 2.643979057591623e-05, "loss": 0.5276, "step": 101 }, { "epoch": 0.16062992125984252, "grad_norm": 0.32152703692651075, "learning_rate": 2.6701570680628273e-05, "loss": 0.512, "step": 102 }, { "epoch": 0.16220472440944883, "grad_norm": 0.3136154802649583, "learning_rate": 2.6963350785340312e-05, "loss": 0.5139, "step": 103 }, { "epoch": 0.16377952755905512, "grad_norm": 0.30705693250603244, "learning_rate": 2.7225130890052358e-05, "loss": 0.5215, "step": 104 }, { "epoch": 0.16535433070866143, "grad_norm": 0.32302565158083707, "learning_rate": 2.7486910994764397e-05, "loss": 0.5273, "step": 105 }, { "epoch": 0.16692913385826771, "grad_norm": 0.32836597486406005, "learning_rate": 2.7748691099476443e-05, "loss": 0.543, "step": 106 }, { "epoch": 0.16850393700787403, "grad_norm": 0.34804956276283144, "learning_rate": 2.8010471204188483e-05, "loss": 0.5068, "step": 107 }, { "epoch": 0.1700787401574803, "grad_norm": 0.28339067362062914, "learning_rate": 2.827225130890053e-05, "loss": 0.4943, "step": 108 }, { "epoch": 0.17165354330708663, "grad_norm": 0.3373878790259843, "learning_rate": 2.8534031413612568e-05, "loss": 0.5045, "step": 109 }, { "epoch": 0.1732283464566929, "grad_norm": 0.3106890142526734, "learning_rate": 2.879581151832461e-05, "loss": 0.5222, "step": 110 }, { "epoch": 0.17480314960629922, "grad_norm": 0.3826244810079641, "learning_rate": 2.905759162303665e-05, "loss": 0.5055, "step": 111 }, { "epoch": 0.1763779527559055, "grad_norm": 0.3257335325900019, "learning_rate": 2.931937172774869e-05, "loss": 0.5071, "step": 112 }, { "epoch": 0.17795275590551182, "grad_norm": 0.41191961152065965, "learning_rate": 2.9581151832460735e-05, "loss": 0.4971, "step": 113 }, { "epoch": 0.1795275590551181, "grad_norm": 0.37596820445874285, "learning_rate": 2.9842931937172774e-05, "loss": 0.5044, "step": 114 }, { "epoch": 0.18110236220472442, "grad_norm": 0.3496426985872993, "learning_rate": 3.010471204188482e-05, "loss": 0.5064, "step": 115 }, { "epoch": 0.1826771653543307, "grad_norm": 0.36031420723062446, "learning_rate": 3.036649214659686e-05, "loss": 0.511, "step": 116 }, { "epoch": 0.18425196850393702, "grad_norm": 0.35121187026599077, "learning_rate": 3.0628272251308905e-05, "loss": 0.5071, "step": 117 }, { "epoch": 0.1858267716535433, "grad_norm": 0.4734458689417803, "learning_rate": 3.0890052356020944e-05, "loss": 0.4972, "step": 118 }, { "epoch": 0.18740157480314962, "grad_norm": 0.34129586174466087, "learning_rate": 3.115183246073299e-05, "loss": 0.4838, "step": 119 }, { "epoch": 0.1889763779527559, "grad_norm": 0.413457311673766, "learning_rate": 3.141361256544503e-05, "loss": 0.5151, "step": 120 }, { "epoch": 0.19055118110236222, "grad_norm": 0.3881292496052053, "learning_rate": 3.167539267015707e-05, "loss": 0.5145, "step": 121 }, { "epoch": 0.1921259842519685, "grad_norm": 0.4592284384341846, "learning_rate": 3.1937172774869115e-05, "loss": 0.4895, "step": 122 }, { "epoch": 0.19370078740157481, "grad_norm": 0.4516161248195763, "learning_rate": 3.2198952879581154e-05, "loss": 0.4821, "step": 123 }, { "epoch": 0.1952755905511811, "grad_norm": 0.3633925327906424, "learning_rate": 3.246073298429319e-05, "loss": 0.516, "step": 124 }, { "epoch": 0.1968503937007874, "grad_norm": 0.39164284116238496, "learning_rate": 3.272251308900524e-05, "loss": 0.5112, "step": 125 }, { "epoch": 0.1984251968503937, "grad_norm": 0.3819720800221838, "learning_rate": 3.298429319371728e-05, "loss": 0.4722, "step": 126 }, { "epoch": 0.2, "grad_norm": 0.4095680430006105, "learning_rate": 3.324607329842932e-05, "loss": 0.5001, "step": 127 }, { "epoch": 0.2015748031496063, "grad_norm": 0.32706290656482556, "learning_rate": 3.350785340314136e-05, "loss": 0.5079, "step": 128 }, { "epoch": 0.2031496062992126, "grad_norm": 0.37070879127351747, "learning_rate": 3.37696335078534e-05, "loss": 0.5304, "step": 129 }, { "epoch": 0.2047244094488189, "grad_norm": 0.3324372227070079, "learning_rate": 3.403141361256545e-05, "loss": 0.4992, "step": 130 }, { "epoch": 0.2062992125984252, "grad_norm": 0.42202360241788117, "learning_rate": 3.429319371727749e-05, "loss": 0.4923, "step": 131 }, { "epoch": 0.2078740157480315, "grad_norm": 0.3655731891948839, "learning_rate": 3.455497382198953e-05, "loss": 0.4912, "step": 132 }, { "epoch": 0.2094488188976378, "grad_norm": 0.3512599070028766, "learning_rate": 3.481675392670157e-05, "loss": 0.5056, "step": 133 }, { "epoch": 0.2110236220472441, "grad_norm": 0.3517722916565129, "learning_rate": 3.507853403141361e-05, "loss": 0.4924, "step": 134 }, { "epoch": 0.2125984251968504, "grad_norm": 0.3698834665856276, "learning_rate": 3.534031413612566e-05, "loss": 0.4947, "step": 135 }, { "epoch": 0.2141732283464567, "grad_norm": 0.39134605299242614, "learning_rate": 3.56020942408377e-05, "loss": 0.508, "step": 136 }, { "epoch": 0.215748031496063, "grad_norm": 0.3655766665560841, "learning_rate": 3.586387434554974e-05, "loss": 0.5249, "step": 137 }, { "epoch": 0.2173228346456693, "grad_norm": 0.37539947285598607, "learning_rate": 3.612565445026178e-05, "loss": 0.4954, "step": 138 }, { "epoch": 0.2188976377952756, "grad_norm": 0.343886593816041, "learning_rate": 3.638743455497383e-05, "loss": 0.4955, "step": 139 }, { "epoch": 0.2204724409448819, "grad_norm": 0.38886007548015894, "learning_rate": 3.664921465968587e-05, "loss": 0.4878, "step": 140 }, { "epoch": 0.2220472440944882, "grad_norm": 0.3737612303721575, "learning_rate": 3.691099476439791e-05, "loss": 0.4797, "step": 141 }, { "epoch": 0.22362204724409449, "grad_norm": 0.42321954303899617, "learning_rate": 3.717277486910995e-05, "loss": 0.4761, "step": 142 }, { "epoch": 0.2251968503937008, "grad_norm": 0.4096832118560926, "learning_rate": 3.743455497382199e-05, "loss": 0.5216, "step": 143 }, { "epoch": 0.22677165354330708, "grad_norm": 0.400074085585784, "learning_rate": 3.769633507853403e-05, "loss": 0.4882, "step": 144 }, { "epoch": 0.2283464566929134, "grad_norm": 0.3562870734712635, "learning_rate": 3.795811518324607e-05, "loss": 0.4995, "step": 145 }, { "epoch": 0.22992125984251968, "grad_norm": 0.41426021044933997, "learning_rate": 3.8219895287958116e-05, "loss": 0.483, "step": 146 }, { "epoch": 0.231496062992126, "grad_norm": 0.38783675543270685, "learning_rate": 3.8481675392670156e-05, "loss": 0.4654, "step": 147 }, { "epoch": 0.23307086614173228, "grad_norm": 0.38902666230171484, "learning_rate": 3.87434554973822e-05, "loss": 0.4901, "step": 148 }, { "epoch": 0.2346456692913386, "grad_norm": 0.3550733540839511, "learning_rate": 3.900523560209424e-05, "loss": 0.4804, "step": 149 }, { "epoch": 0.23622047244094488, "grad_norm": 0.48741514109106465, "learning_rate": 3.926701570680629e-05, "loss": 0.4865, "step": 150 }, { "epoch": 0.2377952755905512, "grad_norm": 0.38700103438035643, "learning_rate": 3.9528795811518326e-05, "loss": 0.4955, "step": 151 }, { "epoch": 0.23937007874015748, "grad_norm": 0.495349359186258, "learning_rate": 3.9790575916230365e-05, "loss": 0.5054, "step": 152 }, { "epoch": 0.2409448818897638, "grad_norm": 0.39736160846061525, "learning_rate": 4.005235602094241e-05, "loss": 0.4846, "step": 153 }, { "epoch": 0.24251968503937008, "grad_norm": 0.47005064843856353, "learning_rate": 4.031413612565445e-05, "loss": 0.4913, "step": 154 }, { "epoch": 0.2440944881889764, "grad_norm": 0.45660373906394774, "learning_rate": 4.0575916230366496e-05, "loss": 0.4763, "step": 155 }, { "epoch": 0.24566929133858267, "grad_norm": 0.44361903935628005, "learning_rate": 4.0837696335078535e-05, "loss": 0.4845, "step": 156 }, { "epoch": 0.247244094488189, "grad_norm": 0.37820017236943354, "learning_rate": 4.109947643979058e-05, "loss": 0.4673, "step": 157 }, { "epoch": 0.24881889763779527, "grad_norm": 0.4254175737313103, "learning_rate": 4.136125654450262e-05, "loss": 0.4854, "step": 158 }, { "epoch": 0.2503937007874016, "grad_norm": 0.3799019981434027, "learning_rate": 4.162303664921467e-05, "loss": 0.5035, "step": 159 }, { "epoch": 0.25196850393700787, "grad_norm": 0.4013474204194146, "learning_rate": 4.1884816753926706e-05, "loss": 0.4916, "step": 160 }, { "epoch": 0.25354330708661416, "grad_norm": 0.3850277633727475, "learning_rate": 4.2146596858638745e-05, "loss": 0.4926, "step": 161 }, { "epoch": 0.2551181102362205, "grad_norm": 0.42991618823449007, "learning_rate": 4.240837696335079e-05, "loss": 0.4875, "step": 162 }, { "epoch": 0.2566929133858268, "grad_norm": 0.4413912870427857, "learning_rate": 4.267015706806283e-05, "loss": 0.4688, "step": 163 }, { "epoch": 0.25826771653543307, "grad_norm": 0.3777990556345731, "learning_rate": 4.293193717277487e-05, "loss": 0.5086, "step": 164 }, { "epoch": 0.25984251968503935, "grad_norm": 0.510378355469851, "learning_rate": 4.319371727748691e-05, "loss": 0.502, "step": 165 }, { "epoch": 0.2614173228346457, "grad_norm": 0.5150942084675197, "learning_rate": 4.3455497382198955e-05, "loss": 0.5141, "step": 166 }, { "epoch": 0.262992125984252, "grad_norm": 0.4321803797640845, "learning_rate": 4.3717277486910994e-05, "loss": 0.4953, "step": 167 }, { "epoch": 0.26456692913385826, "grad_norm": 0.47697796040873763, "learning_rate": 4.397905759162304e-05, "loss": 0.4977, "step": 168 }, { "epoch": 0.26614173228346455, "grad_norm": 0.43906072160294285, "learning_rate": 4.424083769633508e-05, "loss": 0.4818, "step": 169 }, { "epoch": 0.2677165354330709, "grad_norm": 0.5712261877752867, "learning_rate": 4.4502617801047125e-05, "loss": 0.4835, "step": 170 }, { "epoch": 0.2692913385826772, "grad_norm": 0.4663550383512649, "learning_rate": 4.4764397905759164e-05, "loss": 0.4949, "step": 171 }, { "epoch": 0.27086614173228346, "grad_norm": 0.48387737997433045, "learning_rate": 4.50261780104712e-05, "loss": 0.4833, "step": 172 }, { "epoch": 0.27244094488188975, "grad_norm": 0.4605936469874428, "learning_rate": 4.528795811518325e-05, "loss": 0.4651, "step": 173 }, { "epoch": 0.2740157480314961, "grad_norm": 0.39278894287622046, "learning_rate": 4.554973821989529e-05, "loss": 0.4567, "step": 174 }, { "epoch": 0.2755905511811024, "grad_norm": 0.444285689390101, "learning_rate": 4.5811518324607335e-05, "loss": 0.4856, "step": 175 }, { "epoch": 0.27716535433070866, "grad_norm": 0.45982926566749366, "learning_rate": 4.6073298429319374e-05, "loss": 0.4692, "step": 176 }, { "epoch": 0.27874015748031494, "grad_norm": 0.4785830488488037, "learning_rate": 4.633507853403142e-05, "loss": 0.5012, "step": 177 }, { "epoch": 0.2803149606299213, "grad_norm": 0.5388079418279662, "learning_rate": 4.659685863874346e-05, "loss": 0.5016, "step": 178 }, { "epoch": 0.28188976377952757, "grad_norm": 0.48870486318601203, "learning_rate": 4.6858638743455505e-05, "loss": 0.4975, "step": 179 }, { "epoch": 0.28346456692913385, "grad_norm": 0.5001055430994098, "learning_rate": 4.7120418848167544e-05, "loss": 0.4737, "step": 180 }, { "epoch": 0.28503937007874014, "grad_norm": 0.43499986431163545, "learning_rate": 4.738219895287958e-05, "loss": 0.4664, "step": 181 }, { "epoch": 0.2866141732283465, "grad_norm": 0.4085084941527195, "learning_rate": 4.764397905759162e-05, "loss": 0.4893, "step": 182 }, { "epoch": 0.28818897637795277, "grad_norm": 0.6613182804233656, "learning_rate": 4.790575916230366e-05, "loss": 0.4844, "step": 183 }, { "epoch": 0.28976377952755905, "grad_norm": 0.5962478911229055, "learning_rate": 4.816753926701571e-05, "loss": 0.4933, "step": 184 }, { "epoch": 0.29133858267716534, "grad_norm": 0.4665668516526072, "learning_rate": 4.842931937172775e-05, "loss": 0.4563, "step": 185 }, { "epoch": 0.2929133858267717, "grad_norm": 0.8219825279266331, "learning_rate": 4.869109947643979e-05, "loss": 0.4644, "step": 186 }, { "epoch": 0.29448818897637796, "grad_norm": 0.4859081594803487, "learning_rate": 4.895287958115183e-05, "loss": 0.4736, "step": 187 }, { "epoch": 0.29606299212598425, "grad_norm": 0.6078728595358611, "learning_rate": 4.921465968586388e-05, "loss": 0.4801, "step": 188 }, { "epoch": 0.29763779527559053, "grad_norm": 0.5784681877842144, "learning_rate": 4.947643979057592e-05, "loss": 0.482, "step": 189 }, { "epoch": 0.2992125984251969, "grad_norm": 0.4827628160814336, "learning_rate": 4.973821989528796e-05, "loss": 0.4923, "step": 190 }, { "epoch": 0.30078740157480316, "grad_norm": 0.4833944100899588, "learning_rate": 5e-05, "loss": 0.4939, "step": 191 }, { "epoch": 0.30236220472440944, "grad_norm": 0.8199919226312089, "learning_rate": 4.997082847141191e-05, "loss": 0.4911, "step": 192 }, { "epoch": 0.30393700787401573, "grad_norm": 0.6005632020114945, "learning_rate": 4.9941656942823805e-05, "loss": 0.4725, "step": 193 }, { "epoch": 0.30551181102362207, "grad_norm": 0.4079424880191628, "learning_rate": 4.991248541423571e-05, "loss": 0.48, "step": 194 }, { "epoch": 0.30708661417322836, "grad_norm": 0.7548870352198, "learning_rate": 4.988331388564761e-05, "loss": 0.4659, "step": 195 }, { "epoch": 0.30866141732283464, "grad_norm": 1.1514845015553656, "learning_rate": 4.985414235705951e-05, "loss": 0.5062, "step": 196 }, { "epoch": 0.3102362204724409, "grad_norm": 0.4054172109545723, "learning_rate": 4.9824970828471416e-05, "loss": 0.4693, "step": 197 }, { "epoch": 0.31181102362204727, "grad_norm": 0.6553156694727401, "learning_rate": 4.979579929988331e-05, "loss": 0.4583, "step": 198 }, { "epoch": 0.31338582677165355, "grad_norm": 0.5821708181943103, "learning_rate": 4.976662777129522e-05, "loss": 0.4574, "step": 199 }, { "epoch": 0.31496062992125984, "grad_norm": 0.47195388248790854, "learning_rate": 4.973745624270712e-05, "loss": 0.4763, "step": 200 }, { "epoch": 0.3165354330708661, "grad_norm": 0.5690243972135179, "learning_rate": 4.970828471411903e-05, "loss": 0.4716, "step": 201 }, { "epoch": 0.31811023622047246, "grad_norm": 0.5592916353429496, "learning_rate": 4.9679113185530924e-05, "loss": 0.4727, "step": 202 }, { "epoch": 0.31968503937007875, "grad_norm": 0.4156247351566324, "learning_rate": 4.964994165694283e-05, "loss": 0.4704, "step": 203 }, { "epoch": 0.32125984251968503, "grad_norm": 0.4503083173794489, "learning_rate": 4.9620770128354727e-05, "loss": 0.4782, "step": 204 }, { "epoch": 0.3228346456692913, "grad_norm": 0.42380667068526306, "learning_rate": 4.959159859976663e-05, "loss": 0.4874, "step": 205 }, { "epoch": 0.32440944881889766, "grad_norm": 0.3930296962408744, "learning_rate": 4.956242707117853e-05, "loss": 0.4842, "step": 206 }, { "epoch": 0.32598425196850395, "grad_norm": 0.47610502179422337, "learning_rate": 4.953325554259043e-05, "loss": 0.4881, "step": 207 }, { "epoch": 0.32755905511811023, "grad_norm": 0.4447817912708727, "learning_rate": 4.950408401400234e-05, "loss": 0.4611, "step": 208 }, { "epoch": 0.3291338582677165, "grad_norm": 0.5171955991889444, "learning_rate": 4.9474912485414235e-05, "loss": 0.4595, "step": 209 }, { "epoch": 0.33070866141732286, "grad_norm": 0.495272786726448, "learning_rate": 4.944574095682614e-05, "loss": 0.4661, "step": 210 }, { "epoch": 0.33228346456692914, "grad_norm": 0.5159637698802905, "learning_rate": 4.941656942823804e-05, "loss": 0.4758, "step": 211 }, { "epoch": 0.33385826771653543, "grad_norm": 0.6726802350102449, "learning_rate": 4.938739789964994e-05, "loss": 0.4836, "step": 212 }, { "epoch": 0.3354330708661417, "grad_norm": 0.5724770686659812, "learning_rate": 4.9358226371061847e-05, "loss": 0.463, "step": 213 }, { "epoch": 0.33700787401574805, "grad_norm": 0.49800748368418746, "learning_rate": 4.932905484247375e-05, "loss": 0.4628, "step": 214 }, { "epoch": 0.33858267716535434, "grad_norm": 0.6699236624844588, "learning_rate": 4.929988331388565e-05, "loss": 0.4603, "step": 215 }, { "epoch": 0.3401574803149606, "grad_norm": 0.4881066835466793, "learning_rate": 4.927071178529755e-05, "loss": 0.4581, "step": 216 }, { "epoch": 0.3417322834645669, "grad_norm": 0.45611643342891534, "learning_rate": 4.924154025670946e-05, "loss": 0.4894, "step": 217 }, { "epoch": 0.34330708661417325, "grad_norm": 0.5258914351861229, "learning_rate": 4.9212368728121355e-05, "loss": 0.4659, "step": 218 }, { "epoch": 0.34488188976377954, "grad_norm": 0.44404211752382583, "learning_rate": 4.918319719953326e-05, "loss": 0.4652, "step": 219 }, { "epoch": 0.3464566929133858, "grad_norm": 0.48418782318627507, "learning_rate": 4.915402567094516e-05, "loss": 0.4971, "step": 220 }, { "epoch": 0.3480314960629921, "grad_norm": 0.651666405431008, "learning_rate": 4.912485414235706e-05, "loss": 0.4775, "step": 221 }, { "epoch": 0.34960629921259845, "grad_norm": 0.5661253731562232, "learning_rate": 4.909568261376896e-05, "loss": 0.475, "step": 222 }, { "epoch": 0.35118110236220473, "grad_norm": 0.4438239433990005, "learning_rate": 4.9066511085180864e-05, "loss": 0.4969, "step": 223 }, { "epoch": 0.352755905511811, "grad_norm": 0.5458225887484156, "learning_rate": 4.903733955659276e-05, "loss": 0.4782, "step": 224 }, { "epoch": 0.3543307086614173, "grad_norm": 0.5508140066370417, "learning_rate": 4.9008168028004666e-05, "loss": 0.4996, "step": 225 }, { "epoch": 0.35590551181102364, "grad_norm": 0.3881330639770717, "learning_rate": 4.897899649941657e-05, "loss": 0.4777, "step": 226 }, { "epoch": 0.35748031496062993, "grad_norm": 0.5400254277751622, "learning_rate": 4.8949824970828475e-05, "loss": 0.4719, "step": 227 }, { "epoch": 0.3590551181102362, "grad_norm": 0.5163252675100456, "learning_rate": 4.892065344224038e-05, "loss": 0.4747, "step": 228 }, { "epoch": 0.3606299212598425, "grad_norm": 0.40758189609215206, "learning_rate": 4.889148191365228e-05, "loss": 0.4662, "step": 229 }, { "epoch": 0.36220472440944884, "grad_norm": 0.6637421475606932, "learning_rate": 4.886231038506418e-05, "loss": 0.4764, "step": 230 }, { "epoch": 0.3637795275590551, "grad_norm": 0.5876446305513081, "learning_rate": 4.883313885647608e-05, "loss": 0.4798, "step": 231 }, { "epoch": 0.3653543307086614, "grad_norm": 0.46204548150696984, "learning_rate": 4.8803967327887984e-05, "loss": 0.4674, "step": 232 }, { "epoch": 0.3669291338582677, "grad_norm": 0.7421193127459422, "learning_rate": 4.877479579929989e-05, "loss": 0.4848, "step": 233 }, { "epoch": 0.36850393700787404, "grad_norm": 0.7330173443025827, "learning_rate": 4.8745624270711786e-05, "loss": 0.4672, "step": 234 }, { "epoch": 0.3700787401574803, "grad_norm": 0.6449601402184486, "learning_rate": 4.871645274212369e-05, "loss": 0.463, "step": 235 }, { "epoch": 0.3716535433070866, "grad_norm": 0.8453489229504224, "learning_rate": 4.868728121353559e-05, "loss": 0.4782, "step": 236 }, { "epoch": 0.3732283464566929, "grad_norm": 0.7351121865493017, "learning_rate": 4.865810968494749e-05, "loss": 0.4674, "step": 237 }, { "epoch": 0.37480314960629924, "grad_norm": 0.8686382728121524, "learning_rate": 4.862893815635939e-05, "loss": 0.4775, "step": 238 }, { "epoch": 0.3763779527559055, "grad_norm": 0.5506972691892729, "learning_rate": 4.8599766627771295e-05, "loss": 0.4749, "step": 239 }, { "epoch": 0.3779527559055118, "grad_norm": 0.865315151599351, "learning_rate": 4.85705950991832e-05, "loss": 0.4655, "step": 240 }, { "epoch": 0.3795275590551181, "grad_norm": 0.5406280566649008, "learning_rate": 4.8541423570595104e-05, "loss": 0.4475, "step": 241 }, { "epoch": 0.38110236220472443, "grad_norm": 0.6956996459014997, "learning_rate": 4.851225204200701e-05, "loss": 0.4658, "step": 242 }, { "epoch": 0.3826771653543307, "grad_norm": 0.6860286802509643, "learning_rate": 4.8483080513418906e-05, "loss": 0.4607, "step": 243 }, { "epoch": 0.384251968503937, "grad_norm": 0.6178662751565125, "learning_rate": 4.845390898483081e-05, "loss": 0.4584, "step": 244 }, { "epoch": 0.3858267716535433, "grad_norm": 0.8112234374871194, "learning_rate": 4.842473745624271e-05, "loss": 0.4532, "step": 245 }, { "epoch": 0.38740157480314963, "grad_norm": 0.47619853878604423, "learning_rate": 4.839556592765461e-05, "loss": 0.4604, "step": 246 }, { "epoch": 0.3889763779527559, "grad_norm": 0.7656660258488839, "learning_rate": 4.836639439906651e-05, "loss": 0.4769, "step": 247 }, { "epoch": 0.3905511811023622, "grad_norm": 0.47088665820724407, "learning_rate": 4.8337222870478415e-05, "loss": 0.4722, "step": 248 }, { "epoch": 0.3921259842519685, "grad_norm": 0.6573735802268115, "learning_rate": 4.830805134189031e-05, "loss": 0.4683, "step": 249 }, { "epoch": 0.3937007874015748, "grad_norm": 0.6986346788440789, "learning_rate": 4.827887981330222e-05, "loss": 0.4683, "step": 250 }, { "epoch": 0.3952755905511811, "grad_norm": 0.5309588190989775, "learning_rate": 4.824970828471412e-05, "loss": 0.4771, "step": 251 }, { "epoch": 0.3968503937007874, "grad_norm": 0.567276971215343, "learning_rate": 4.822053675612602e-05, "loss": 0.4573, "step": 252 }, { "epoch": 0.3984251968503937, "grad_norm": 0.4479449981761639, "learning_rate": 4.8191365227537924e-05, "loss": 0.4779, "step": 253 }, { "epoch": 0.4, "grad_norm": 0.4881403127748902, "learning_rate": 4.816219369894983e-05, "loss": 0.4557, "step": 254 }, { "epoch": 0.4015748031496063, "grad_norm": 0.46552345505194964, "learning_rate": 4.813302217036173e-05, "loss": 0.4572, "step": 255 }, { "epoch": 0.4031496062992126, "grad_norm": 0.36239163503177385, "learning_rate": 4.810385064177363e-05, "loss": 0.4687, "step": 256 }, { "epoch": 0.4047244094488189, "grad_norm": 0.5035230466928424, "learning_rate": 4.8074679113185535e-05, "loss": 0.4621, "step": 257 }, { "epoch": 0.4062992125984252, "grad_norm": 0.4367192075505084, "learning_rate": 4.804550758459744e-05, "loss": 0.4512, "step": 258 }, { "epoch": 0.4078740157480315, "grad_norm": 0.3997041902246384, "learning_rate": 4.801633605600934e-05, "loss": 0.4486, "step": 259 }, { "epoch": 0.4094488188976378, "grad_norm": 0.4470142624394144, "learning_rate": 4.798716452742124e-05, "loss": 0.4709, "step": 260 }, { "epoch": 0.4110236220472441, "grad_norm": 0.4475102933018223, "learning_rate": 4.795799299883314e-05, "loss": 0.479, "step": 261 }, { "epoch": 0.4125984251968504, "grad_norm": 0.5664925206319695, "learning_rate": 4.7928821470245044e-05, "loss": 0.4974, "step": 262 }, { "epoch": 0.4141732283464567, "grad_norm": 0.4016061854806556, "learning_rate": 4.789964994165694e-05, "loss": 0.4643, "step": 263 }, { "epoch": 0.415748031496063, "grad_norm": 0.43378984804807064, "learning_rate": 4.7870478413068846e-05, "loss": 0.4654, "step": 264 }, { "epoch": 0.41732283464566927, "grad_norm": 0.41582832155479615, "learning_rate": 4.784130688448074e-05, "loss": 0.4609, "step": 265 }, { "epoch": 0.4188976377952756, "grad_norm": 0.5298739348609497, "learning_rate": 4.781213535589265e-05, "loss": 0.4507, "step": 266 }, { "epoch": 0.4204724409448819, "grad_norm": 0.40059450530889984, "learning_rate": 4.778296382730455e-05, "loss": 0.4639, "step": 267 }, { "epoch": 0.4220472440944882, "grad_norm": 0.432451963252051, "learning_rate": 4.775379229871646e-05, "loss": 0.4821, "step": 268 }, { "epoch": 0.42362204724409447, "grad_norm": 0.4220486641907116, "learning_rate": 4.772462077012836e-05, "loss": 0.4392, "step": 269 }, { "epoch": 0.4251968503937008, "grad_norm": 0.4728292556109989, "learning_rate": 4.769544924154026e-05, "loss": 0.4644, "step": 270 }, { "epoch": 0.4267716535433071, "grad_norm": 0.37586110633383824, "learning_rate": 4.7666277712952163e-05, "loss": 0.4542, "step": 271 }, { "epoch": 0.4283464566929134, "grad_norm": 0.5297309918608535, "learning_rate": 4.763710618436406e-05, "loss": 0.4532, "step": 272 }, { "epoch": 0.42992125984251967, "grad_norm": 0.3994975270045104, "learning_rate": 4.7607934655775966e-05, "loss": 0.4597, "step": 273 }, { "epoch": 0.431496062992126, "grad_norm": 0.5353873955230466, "learning_rate": 4.757876312718786e-05, "loss": 0.4705, "step": 274 }, { "epoch": 0.4330708661417323, "grad_norm": 0.4888512441480693, "learning_rate": 4.754959159859977e-05, "loss": 0.4751, "step": 275 }, { "epoch": 0.4346456692913386, "grad_norm": 0.5250610083100687, "learning_rate": 4.752042007001167e-05, "loss": 0.4767, "step": 276 }, { "epoch": 0.43622047244094486, "grad_norm": 0.5071221096781429, "learning_rate": 4.749124854142357e-05, "loss": 0.4568, "step": 277 }, { "epoch": 0.4377952755905512, "grad_norm": 0.5465569798274295, "learning_rate": 4.7462077012835474e-05, "loss": 0.459, "step": 278 }, { "epoch": 0.4393700787401575, "grad_norm": 0.44109601109984736, "learning_rate": 4.743290548424737e-05, "loss": 0.4628, "step": 279 }, { "epoch": 0.4409448818897638, "grad_norm": 0.48550652319258814, "learning_rate": 4.7403733955659277e-05, "loss": 0.4491, "step": 280 }, { "epoch": 0.44251968503937006, "grad_norm": 0.5645885894621411, "learning_rate": 4.737456242707118e-05, "loss": 0.4528, "step": 281 }, { "epoch": 0.4440944881889764, "grad_norm": 0.3698057083942626, "learning_rate": 4.7345390898483085e-05, "loss": 0.4605, "step": 282 }, { "epoch": 0.4456692913385827, "grad_norm": 0.5773319809836301, "learning_rate": 4.731621936989499e-05, "loss": 0.4806, "step": 283 }, { "epoch": 0.44724409448818897, "grad_norm": 0.4538646603850104, "learning_rate": 4.728704784130689e-05, "loss": 0.4771, "step": 284 }, { "epoch": 0.44881889763779526, "grad_norm": 0.3998401915259715, "learning_rate": 4.725787631271879e-05, "loss": 0.4749, "step": 285 }, { "epoch": 0.4503937007874016, "grad_norm": 0.3599601090372482, "learning_rate": 4.722870478413069e-05, "loss": 0.4612, "step": 286 }, { "epoch": 0.4519685039370079, "grad_norm": 0.3788055134170955, "learning_rate": 4.7199533255542594e-05, "loss": 0.4658, "step": 287 }, { "epoch": 0.45354330708661417, "grad_norm": 0.3683985825112104, "learning_rate": 4.717036172695449e-05, "loss": 0.4372, "step": 288 }, { "epoch": 0.45511811023622045, "grad_norm": 0.3528740085409915, "learning_rate": 4.7141190198366396e-05, "loss": 0.4622, "step": 289 }, { "epoch": 0.4566929133858268, "grad_norm": 0.3247812402406993, "learning_rate": 4.7112018669778294e-05, "loss": 0.4486, "step": 290 }, { "epoch": 0.4582677165354331, "grad_norm": 0.38896161112398014, "learning_rate": 4.70828471411902e-05, "loss": 0.4466, "step": 291 }, { "epoch": 0.45984251968503936, "grad_norm": 0.3571478190160688, "learning_rate": 4.70536756126021e-05, "loss": 0.4592, "step": 292 }, { "epoch": 0.46141732283464565, "grad_norm": 0.3437863949613371, "learning_rate": 4.7024504084014e-05, "loss": 0.4468, "step": 293 }, { "epoch": 0.462992125984252, "grad_norm": 0.41381248580071517, "learning_rate": 4.6995332555425905e-05, "loss": 0.4263, "step": 294 }, { "epoch": 0.4645669291338583, "grad_norm": 0.45206866915688826, "learning_rate": 4.696616102683781e-05, "loss": 0.4788, "step": 295 }, { "epoch": 0.46614173228346456, "grad_norm": 0.4604838885870295, "learning_rate": 4.6936989498249714e-05, "loss": 0.4695, "step": 296 }, { "epoch": 0.46771653543307085, "grad_norm": 0.4097085571980374, "learning_rate": 4.690781796966161e-05, "loss": 0.446, "step": 297 }, { "epoch": 0.4692913385826772, "grad_norm": 0.4922523990549993, "learning_rate": 4.6878646441073516e-05, "loss": 0.4781, "step": 298 }, { "epoch": 0.47086614173228347, "grad_norm": 0.4890015900962753, "learning_rate": 4.6849474912485414e-05, "loss": 0.4455, "step": 299 }, { "epoch": 0.47244094488188976, "grad_norm": 0.3868391645086783, "learning_rate": 4.682030338389732e-05, "loss": 0.4525, "step": 300 }, { "epoch": 0.47401574803149604, "grad_norm": 0.47224961609708466, "learning_rate": 4.679113185530922e-05, "loss": 0.4666, "step": 301 }, { "epoch": 0.4755905511811024, "grad_norm": 0.4583216689769597, "learning_rate": 4.676196032672112e-05, "loss": 0.4672, "step": 302 }, { "epoch": 0.47716535433070867, "grad_norm": 0.5298286365748922, "learning_rate": 4.6732788798133025e-05, "loss": 0.4527, "step": 303 }, { "epoch": 0.47874015748031495, "grad_norm": 0.3658194712960081, "learning_rate": 4.670361726954492e-05, "loss": 0.4443, "step": 304 }, { "epoch": 0.48031496062992124, "grad_norm": 0.4393148377237598, "learning_rate": 4.667444574095683e-05, "loss": 0.4627, "step": 305 }, { "epoch": 0.4818897637795276, "grad_norm": 0.4678347360101068, "learning_rate": 4.6645274212368725e-05, "loss": 0.4552, "step": 306 }, { "epoch": 0.48346456692913387, "grad_norm": 0.37813552830186076, "learning_rate": 4.661610268378063e-05, "loss": 0.4454, "step": 307 }, { "epoch": 0.48503937007874015, "grad_norm": 0.5322438763267271, "learning_rate": 4.6586931155192534e-05, "loss": 0.4585, "step": 308 }, { "epoch": 0.48661417322834644, "grad_norm": 0.3320089930609413, "learning_rate": 4.655775962660444e-05, "loss": 0.4553, "step": 309 }, { "epoch": 0.4881889763779528, "grad_norm": 0.4697213712112163, "learning_rate": 4.652858809801634e-05, "loss": 0.4637, "step": 310 }, { "epoch": 0.48976377952755906, "grad_norm": 0.381149939697693, "learning_rate": 4.649941656942824e-05, "loss": 0.4516, "step": 311 }, { "epoch": 0.49133858267716535, "grad_norm": 0.49443675161331446, "learning_rate": 4.6470245040840145e-05, "loss": 0.4513, "step": 312 }, { "epoch": 0.49291338582677163, "grad_norm": 0.46109871207116226, "learning_rate": 4.644107351225204e-05, "loss": 0.482, "step": 313 }, { "epoch": 0.494488188976378, "grad_norm": 0.4551075190822483, "learning_rate": 4.641190198366395e-05, "loss": 0.4443, "step": 314 }, { "epoch": 0.49606299212598426, "grad_norm": 0.32894435679371753, "learning_rate": 4.6382730455075845e-05, "loss": 0.4443, "step": 315 }, { "epoch": 0.49763779527559054, "grad_norm": 0.34851263048742204, "learning_rate": 4.635355892648775e-05, "loss": 0.4484, "step": 316 }, { "epoch": 0.49921259842519683, "grad_norm": 0.3458971801556171, "learning_rate": 4.6324387397899654e-05, "loss": 0.4528, "step": 317 }, { "epoch": 0.5007874015748032, "grad_norm": 0.33679835223142923, "learning_rate": 4.629521586931155e-05, "loss": 0.4465, "step": 318 }, { "epoch": 0.5023622047244094, "grad_norm": 0.358477219292855, "learning_rate": 4.6266044340723456e-05, "loss": 0.4381, "step": 319 }, { "epoch": 0.5039370078740157, "grad_norm": 0.36344436182874823, "learning_rate": 4.6236872812135354e-05, "loss": 0.4555, "step": 320 }, { "epoch": 0.5055118110236221, "grad_norm": 0.38980175288159863, "learning_rate": 4.620770128354726e-05, "loss": 0.4517, "step": 321 }, { "epoch": 0.5070866141732283, "grad_norm": 0.4744203692242079, "learning_rate": 4.617852975495916e-05, "loss": 0.4728, "step": 322 }, { "epoch": 0.5086614173228347, "grad_norm": 0.4191141247162168, "learning_rate": 4.614935822637107e-05, "loss": 0.4701, "step": 323 }, { "epoch": 0.510236220472441, "grad_norm": 0.5359966655113817, "learning_rate": 4.6120186697782965e-05, "loss": 0.4586, "step": 324 }, { "epoch": 0.5118110236220472, "grad_norm": 0.5037263044183322, "learning_rate": 4.609101516919487e-05, "loss": 0.46, "step": 325 }, { "epoch": 0.5133858267716536, "grad_norm": 0.48987001313342776, "learning_rate": 4.6061843640606774e-05, "loss": 0.4509, "step": 326 }, { "epoch": 0.5149606299212598, "grad_norm": 0.3840514799041956, "learning_rate": 4.603267211201867e-05, "loss": 0.4475, "step": 327 }, { "epoch": 0.5165354330708661, "grad_norm": 0.5408977221671465, "learning_rate": 4.6003500583430576e-05, "loss": 0.4487, "step": 328 }, { "epoch": 0.5181102362204725, "grad_norm": 0.36285276386418813, "learning_rate": 4.5974329054842474e-05, "loss": 0.4374, "step": 329 }, { "epoch": 0.5196850393700787, "grad_norm": 0.512763743491573, "learning_rate": 4.594515752625438e-05, "loss": 0.479, "step": 330 }, { "epoch": 0.521259842519685, "grad_norm": 0.42521208783417563, "learning_rate": 4.5915985997666276e-05, "loss": 0.4621, "step": 331 }, { "epoch": 0.5228346456692914, "grad_norm": 0.44277894907544607, "learning_rate": 4.588681446907818e-05, "loss": 0.4705, "step": 332 }, { "epoch": 0.5244094488188976, "grad_norm": 0.49566075769365825, "learning_rate": 4.5857642940490085e-05, "loss": 0.4445, "step": 333 }, { "epoch": 0.525984251968504, "grad_norm": 0.35511186143588597, "learning_rate": 4.582847141190198e-05, "loss": 0.461, "step": 334 }, { "epoch": 0.5275590551181102, "grad_norm": 0.564614331228573, "learning_rate": 4.579929988331389e-05, "loss": 0.4535, "step": 335 }, { "epoch": 0.5291338582677165, "grad_norm": 0.3902472461097566, "learning_rate": 4.577012835472579e-05, "loss": 0.4337, "step": 336 }, { "epoch": 0.5307086614173229, "grad_norm": 0.4108902540200116, "learning_rate": 4.5740956826137696e-05, "loss": 0.4558, "step": 337 }, { "epoch": 0.5322834645669291, "grad_norm": 0.48782352509390065, "learning_rate": 4.5711785297549593e-05, "loss": 0.4399, "step": 338 }, { "epoch": 0.5338582677165354, "grad_norm": 0.3657783752232081, "learning_rate": 4.56826137689615e-05, "loss": 0.4547, "step": 339 }, { "epoch": 0.5354330708661418, "grad_norm": 0.363752155088968, "learning_rate": 4.5653442240373396e-05, "loss": 0.4478, "step": 340 }, { "epoch": 0.537007874015748, "grad_norm": 0.3846524278732506, "learning_rate": 4.56242707117853e-05, "loss": 0.4417, "step": 341 }, { "epoch": 0.5385826771653544, "grad_norm": 0.41092655282873597, "learning_rate": 4.5595099183197205e-05, "loss": 0.451, "step": 342 }, { "epoch": 0.5401574803149606, "grad_norm": 0.3845140211913491, "learning_rate": 4.55659276546091e-05, "loss": 0.4428, "step": 343 }, { "epoch": 0.5417322834645669, "grad_norm": 0.36694954995128937, "learning_rate": 4.553675612602101e-05, "loss": 0.4466, "step": 344 }, { "epoch": 0.5433070866141733, "grad_norm": 0.382887698221884, "learning_rate": 4.5507584597432904e-05, "loss": 0.4381, "step": 345 }, { "epoch": 0.5448818897637795, "grad_norm": 0.41269585300752565, "learning_rate": 4.547841306884481e-05, "loss": 0.4657, "step": 346 }, { "epoch": 0.5464566929133858, "grad_norm": 0.4476319291072117, "learning_rate": 4.5449241540256707e-05, "loss": 0.4829, "step": 347 }, { "epoch": 0.5480314960629922, "grad_norm": 0.44448728029136325, "learning_rate": 4.542007001166861e-05, "loss": 0.4332, "step": 348 }, { "epoch": 0.5496062992125984, "grad_norm": 0.424283418877656, "learning_rate": 4.5390898483080515e-05, "loss": 0.4367, "step": 349 }, { "epoch": 0.5511811023622047, "grad_norm": 0.4369595075527717, "learning_rate": 4.536172695449242e-05, "loss": 0.4376, "step": 350 }, { "epoch": 0.552755905511811, "grad_norm": 0.4404306997617378, "learning_rate": 4.5332555425904324e-05, "loss": 0.456, "step": 351 }, { "epoch": 0.5543307086614173, "grad_norm": 0.4531618994960058, "learning_rate": 4.530338389731622e-05, "loss": 0.4174, "step": 352 }, { "epoch": 0.5559055118110237, "grad_norm": 0.40763960840839136, "learning_rate": 4.5274212368728127e-05, "loss": 0.451, "step": 353 }, { "epoch": 0.5574803149606299, "grad_norm": 0.4354389333874302, "learning_rate": 4.5245040840140024e-05, "loss": 0.4423, "step": 354 }, { "epoch": 0.5590551181102362, "grad_norm": 0.46717217037010095, "learning_rate": 4.521586931155193e-05, "loss": 0.4455, "step": 355 }, { "epoch": 0.5606299212598426, "grad_norm": 0.43196013290163465, "learning_rate": 4.5186697782963826e-05, "loss": 0.4418, "step": 356 }, { "epoch": 0.5622047244094488, "grad_norm": 0.4564132858164859, "learning_rate": 4.515752625437573e-05, "loss": 0.4619, "step": 357 }, { "epoch": 0.5637795275590551, "grad_norm": 0.4172842735772344, "learning_rate": 4.5128354725787635e-05, "loss": 0.4716, "step": 358 }, { "epoch": 0.5653543307086614, "grad_norm": 0.4698598888941704, "learning_rate": 4.509918319719953e-05, "loss": 0.4541, "step": 359 }, { "epoch": 0.5669291338582677, "grad_norm": 0.4363045758567553, "learning_rate": 4.507001166861144e-05, "loss": 0.453, "step": 360 }, { "epoch": 0.568503937007874, "grad_norm": 0.36058331032400576, "learning_rate": 4.5040840140023335e-05, "loss": 0.4542, "step": 361 }, { "epoch": 0.5700787401574803, "grad_norm": 0.38084202379417437, "learning_rate": 4.501166861143524e-05, "loss": 0.441, "step": 362 }, { "epoch": 0.5716535433070866, "grad_norm": 0.34704665073227137, "learning_rate": 4.4982497082847144e-05, "loss": 0.4328, "step": 363 }, { "epoch": 0.573228346456693, "grad_norm": 0.4285252631038537, "learning_rate": 4.495332555425905e-05, "loss": 0.4365, "step": 364 }, { "epoch": 0.5748031496062992, "grad_norm": 0.36542551599851625, "learning_rate": 4.4924154025670946e-05, "loss": 0.4415, "step": 365 }, { "epoch": 0.5763779527559055, "grad_norm": 0.4497897318977249, "learning_rate": 4.489498249708285e-05, "loss": 0.4507, "step": 366 }, { "epoch": 0.5779527559055118, "grad_norm": 0.3519814828943838, "learning_rate": 4.4865810968494755e-05, "loss": 0.4489, "step": 367 }, { "epoch": 0.5795275590551181, "grad_norm": 0.4739677344055051, "learning_rate": 4.483663943990665e-05, "loss": 0.454, "step": 368 }, { "epoch": 0.5811023622047244, "grad_norm": 0.4530840111259633, "learning_rate": 4.480746791131856e-05, "loss": 0.4384, "step": 369 }, { "epoch": 0.5826771653543307, "grad_norm": 0.37550357279982227, "learning_rate": 4.4778296382730455e-05, "loss": 0.4448, "step": 370 }, { "epoch": 0.584251968503937, "grad_norm": 0.42745558646830784, "learning_rate": 4.474912485414236e-05, "loss": 0.4343, "step": 371 }, { "epoch": 0.5858267716535434, "grad_norm": 0.394530159080798, "learning_rate": 4.471995332555426e-05, "loss": 0.4719, "step": 372 }, { "epoch": 0.5874015748031496, "grad_norm": 0.4506394663021526, "learning_rate": 4.469078179696616e-05, "loss": 0.4609, "step": 373 }, { "epoch": 0.5889763779527559, "grad_norm": 0.3738808171124792, "learning_rate": 4.466161026837806e-05, "loss": 0.4574, "step": 374 }, { "epoch": 0.5905511811023622, "grad_norm": 0.39028971552041486, "learning_rate": 4.4632438739789964e-05, "loss": 0.4473, "step": 375 }, { "epoch": 0.5921259842519685, "grad_norm": 0.3523922263651231, "learning_rate": 4.460326721120187e-05, "loss": 0.4598, "step": 376 }, { "epoch": 0.5937007874015748, "grad_norm": 0.36551663893725733, "learning_rate": 4.457409568261377e-05, "loss": 0.4528, "step": 377 }, { "epoch": 0.5952755905511811, "grad_norm": 0.33498715978763727, "learning_rate": 4.454492415402568e-05, "loss": 0.4403, "step": 378 }, { "epoch": 0.5968503937007874, "grad_norm": 0.3444315924910941, "learning_rate": 4.4515752625437575e-05, "loss": 0.4549, "step": 379 }, { "epoch": 0.5984251968503937, "grad_norm": 0.36356549542359673, "learning_rate": 4.448658109684948e-05, "loss": 0.4351, "step": 380 }, { "epoch": 0.6, "grad_norm": 0.36911580236636526, "learning_rate": 4.445740956826138e-05, "loss": 0.4636, "step": 381 }, { "epoch": 0.6015748031496063, "grad_norm": 0.3913070084841646, "learning_rate": 4.442823803967328e-05, "loss": 0.4343, "step": 382 }, { "epoch": 0.6031496062992125, "grad_norm": 0.4061850613741893, "learning_rate": 4.4399066511085186e-05, "loss": 0.4535, "step": 383 }, { "epoch": 0.6047244094488189, "grad_norm": 0.4035588078829638, "learning_rate": 4.4369894982497084e-05, "loss": 0.473, "step": 384 }, { "epoch": 0.6062992125984252, "grad_norm": 0.4553104858364246, "learning_rate": 4.434072345390899e-05, "loss": 0.425, "step": 385 }, { "epoch": 0.6078740157480315, "grad_norm": 0.40466228244179614, "learning_rate": 4.4311551925320886e-05, "loss": 0.4539, "step": 386 }, { "epoch": 0.6094488188976378, "grad_norm": 0.34950800484980993, "learning_rate": 4.428238039673279e-05, "loss": 0.4274, "step": 387 }, { "epoch": 0.6110236220472441, "grad_norm": 0.39406946999038717, "learning_rate": 4.425320886814469e-05, "loss": 0.4684, "step": 388 }, { "epoch": 0.6125984251968504, "grad_norm": 0.44427910469242776, "learning_rate": 4.422403733955659e-05, "loss": 0.4493, "step": 389 }, { "epoch": 0.6141732283464567, "grad_norm": 0.35082449531912907, "learning_rate": 4.41948658109685e-05, "loss": 0.4405, "step": 390 }, { "epoch": 0.6157480314960629, "grad_norm": 0.436672919293726, "learning_rate": 4.41656942823804e-05, "loss": 0.4486, "step": 391 }, { "epoch": 0.6173228346456693, "grad_norm": 0.4014579652844314, "learning_rate": 4.4136522753792306e-05, "loss": 0.4533, "step": 392 }, { "epoch": 0.6188976377952756, "grad_norm": 0.35561917255979364, "learning_rate": 4.4107351225204204e-05, "loss": 0.4291, "step": 393 }, { "epoch": 0.6204724409448819, "grad_norm": 0.40631952224819884, "learning_rate": 4.407817969661611e-05, "loss": 0.4149, "step": 394 }, { "epoch": 0.6220472440944882, "grad_norm": 0.3986633856469438, "learning_rate": 4.4049008168028006e-05, "loss": 0.4641, "step": 395 }, { "epoch": 0.6236220472440945, "grad_norm": 0.47998002671508955, "learning_rate": 4.401983663943991e-05, "loss": 0.4538, "step": 396 }, { "epoch": 0.6251968503937008, "grad_norm": 0.4354309238966227, "learning_rate": 4.399066511085181e-05, "loss": 0.4332, "step": 397 }, { "epoch": 0.6267716535433071, "grad_norm": 0.477476589337264, "learning_rate": 4.396149358226371e-05, "loss": 0.4547, "step": 398 }, { "epoch": 0.6283464566929133, "grad_norm": 0.4822689073390809, "learning_rate": 4.393232205367561e-05, "loss": 0.4459, "step": 399 }, { "epoch": 0.6299212598425197, "grad_norm": 0.3720959196022425, "learning_rate": 4.3903150525087515e-05, "loss": 0.4532, "step": 400 }, { "epoch": 0.631496062992126, "grad_norm": 0.4754853111905845, "learning_rate": 4.387397899649942e-05, "loss": 0.445, "step": 401 }, { "epoch": 0.6330708661417322, "grad_norm": 0.36969322328486875, "learning_rate": 4.384480746791132e-05, "loss": 0.4326, "step": 402 }, { "epoch": 0.6346456692913386, "grad_norm": 0.43646202675439383, "learning_rate": 4.381563593932322e-05, "loss": 0.4379, "step": 403 }, { "epoch": 0.6362204724409449, "grad_norm": 0.39590081475119115, "learning_rate": 4.3786464410735126e-05, "loss": 0.4456, "step": 404 }, { "epoch": 0.6377952755905512, "grad_norm": 0.4327624138029435, "learning_rate": 4.375729288214703e-05, "loss": 0.4628, "step": 405 }, { "epoch": 0.6393700787401575, "grad_norm": 0.4176607013386392, "learning_rate": 4.372812135355893e-05, "loss": 0.4146, "step": 406 }, { "epoch": 0.6409448818897637, "grad_norm": 0.3632772259086441, "learning_rate": 4.369894982497083e-05, "loss": 0.4486, "step": 407 }, { "epoch": 0.6425196850393701, "grad_norm": 0.4286152038666698, "learning_rate": 4.366977829638274e-05, "loss": 0.4489, "step": 408 }, { "epoch": 0.6440944881889764, "grad_norm": 0.3097431864562948, "learning_rate": 4.3640606767794635e-05, "loss": 0.4464, "step": 409 }, { "epoch": 0.6456692913385826, "grad_norm": 0.3977227545634238, "learning_rate": 4.361143523920654e-05, "loss": 0.4493, "step": 410 }, { "epoch": 0.647244094488189, "grad_norm": 0.39153781144016175, "learning_rate": 4.358226371061844e-05, "loss": 0.4257, "step": 411 }, { "epoch": 0.6488188976377953, "grad_norm": 0.45353107879622967, "learning_rate": 4.355309218203034e-05, "loss": 0.4249, "step": 412 }, { "epoch": 0.6503937007874016, "grad_norm": 0.34269046656872115, "learning_rate": 4.352392065344224e-05, "loss": 0.4455, "step": 413 }, { "epoch": 0.6519685039370079, "grad_norm": 0.4905968560151782, "learning_rate": 4.349474912485414e-05, "loss": 0.4322, "step": 414 }, { "epoch": 0.6535433070866141, "grad_norm": 0.3709307969442702, "learning_rate": 4.346557759626604e-05, "loss": 0.4498, "step": 415 }, { "epoch": 0.6551181102362205, "grad_norm": 0.43900430409217106, "learning_rate": 4.3436406067677945e-05, "loss": 0.4375, "step": 416 }, { "epoch": 0.6566929133858268, "grad_norm": 0.43398930990878437, "learning_rate": 4.340723453908985e-05, "loss": 0.4458, "step": 417 }, { "epoch": 0.658267716535433, "grad_norm": 0.34658002471634974, "learning_rate": 4.3378063010501754e-05, "loss": 0.437, "step": 418 }, { "epoch": 0.6598425196850394, "grad_norm": 0.4665949563668454, "learning_rate": 4.334889148191366e-05, "loss": 0.4543, "step": 419 }, { "epoch": 0.6614173228346457, "grad_norm": 0.3368652079929485, "learning_rate": 4.3319719953325557e-05, "loss": 0.4294, "step": 420 }, { "epoch": 0.662992125984252, "grad_norm": 0.4329294780674907, "learning_rate": 4.329054842473746e-05, "loss": 0.4337, "step": 421 }, { "epoch": 0.6645669291338583, "grad_norm": 0.399858529407598, "learning_rate": 4.326137689614936e-05, "loss": 0.4322, "step": 422 }, { "epoch": 0.6661417322834645, "grad_norm": 0.37414466443366634, "learning_rate": 4.323220536756126e-05, "loss": 0.4358, "step": 423 }, { "epoch": 0.6677165354330709, "grad_norm": 0.48095826123372953, "learning_rate": 4.320303383897317e-05, "loss": 0.4464, "step": 424 }, { "epoch": 0.6692913385826772, "grad_norm": 0.3762250594560389, "learning_rate": 4.3173862310385065e-05, "loss": 0.4448, "step": 425 }, { "epoch": 0.6708661417322834, "grad_norm": 0.39117977052885594, "learning_rate": 4.314469078179697e-05, "loss": 0.4362, "step": 426 }, { "epoch": 0.6724409448818898, "grad_norm": 0.4242242784445451, "learning_rate": 4.311551925320887e-05, "loss": 0.4352, "step": 427 }, { "epoch": 0.6740157480314961, "grad_norm": 0.3324605502111233, "learning_rate": 4.308634772462077e-05, "loss": 0.4411, "step": 428 }, { "epoch": 0.6755905511811023, "grad_norm": 0.4561837587735267, "learning_rate": 4.305717619603267e-05, "loss": 0.4403, "step": 429 }, { "epoch": 0.6771653543307087, "grad_norm": 0.3142070949494872, "learning_rate": 4.3028004667444574e-05, "loss": 0.4403, "step": 430 }, { "epoch": 0.6787401574803149, "grad_norm": 0.5083683853569048, "learning_rate": 4.299883313885648e-05, "loss": 0.4238, "step": 431 }, { "epoch": 0.6803149606299213, "grad_norm": 0.34214836106340396, "learning_rate": 4.296966161026838e-05, "loss": 0.4276, "step": 432 }, { "epoch": 0.6818897637795276, "grad_norm": 0.49842966742547523, "learning_rate": 4.294049008168029e-05, "loss": 0.4417, "step": 433 }, { "epoch": 0.6834645669291338, "grad_norm": 0.32368142174482056, "learning_rate": 4.2911318553092185e-05, "loss": 0.4311, "step": 434 }, { "epoch": 0.6850393700787402, "grad_norm": 0.433311780059108, "learning_rate": 4.288214702450409e-05, "loss": 0.4415, "step": 435 }, { "epoch": 0.6866141732283465, "grad_norm": 0.3986294386305301, "learning_rate": 4.285297549591599e-05, "loss": 0.432, "step": 436 }, { "epoch": 0.6881889763779527, "grad_norm": 0.3596891735118988, "learning_rate": 4.282380396732789e-05, "loss": 0.435, "step": 437 }, { "epoch": 0.6897637795275591, "grad_norm": 0.3696738386347177, "learning_rate": 4.279463243873979e-05, "loss": 0.4388, "step": 438 }, { "epoch": 0.6913385826771653, "grad_norm": 0.35089200627081935, "learning_rate": 4.2765460910151694e-05, "loss": 0.4343, "step": 439 }, { "epoch": 0.6929133858267716, "grad_norm": 0.38153936451015585, "learning_rate": 4.273628938156359e-05, "loss": 0.4354, "step": 440 }, { "epoch": 0.694488188976378, "grad_norm": 0.3184577483564101, "learning_rate": 4.2707117852975496e-05, "loss": 0.4523, "step": 441 }, { "epoch": 0.6960629921259842, "grad_norm": 0.4285864163305924, "learning_rate": 4.26779463243874e-05, "loss": 0.4396, "step": 442 }, { "epoch": 0.6976377952755906, "grad_norm": 0.3376960017497503, "learning_rate": 4.26487747957993e-05, "loss": 0.4408, "step": 443 }, { "epoch": 0.6992125984251969, "grad_norm": 0.36476607245019155, "learning_rate": 4.26196032672112e-05, "loss": 0.4365, "step": 444 }, { "epoch": 0.7007874015748031, "grad_norm": 0.3898320595108796, "learning_rate": 4.259043173862311e-05, "loss": 0.4187, "step": 445 }, { "epoch": 0.7023622047244095, "grad_norm": 0.3546924316214882, "learning_rate": 4.256126021003501e-05, "loss": 0.4401, "step": 446 }, { "epoch": 0.7039370078740157, "grad_norm": 0.38534173154144685, "learning_rate": 4.253208868144691e-05, "loss": 0.4443, "step": 447 }, { "epoch": 0.705511811023622, "grad_norm": 0.3758538205108525, "learning_rate": 4.2502917152858814e-05, "loss": 0.4321, "step": 448 }, { "epoch": 0.7070866141732284, "grad_norm": 0.4096199148352505, "learning_rate": 4.247374562427072e-05, "loss": 0.458, "step": 449 }, { "epoch": 0.7086614173228346, "grad_norm": 0.38116978381128874, "learning_rate": 4.2444574095682616e-05, "loss": 0.4456, "step": 450 }, { "epoch": 0.710236220472441, "grad_norm": 0.32826572269552556, "learning_rate": 4.241540256709452e-05, "loss": 0.4381, "step": 451 }, { "epoch": 0.7118110236220473, "grad_norm": 0.39353786876834396, "learning_rate": 4.238623103850642e-05, "loss": 0.4458, "step": 452 }, { "epoch": 0.7133858267716535, "grad_norm": 0.3544773589512462, "learning_rate": 4.235705950991832e-05, "loss": 0.4303, "step": 453 }, { "epoch": 0.7149606299212599, "grad_norm": 0.3119588190701197, "learning_rate": 4.232788798133022e-05, "loss": 0.4427, "step": 454 }, { "epoch": 0.7165354330708661, "grad_norm": 0.40129309396842405, "learning_rate": 4.2298716452742125e-05, "loss": 0.4441, "step": 455 }, { "epoch": 0.7181102362204724, "grad_norm": 0.3275697389173181, "learning_rate": 4.226954492415402e-05, "loss": 0.4325, "step": 456 }, { "epoch": 0.7196850393700788, "grad_norm": 0.29430236040688446, "learning_rate": 4.224037339556593e-05, "loss": 0.4182, "step": 457 }, { "epoch": 0.721259842519685, "grad_norm": 0.39594225414707285, "learning_rate": 4.221120186697783e-05, "loss": 0.4292, "step": 458 }, { "epoch": 0.7228346456692913, "grad_norm": 0.34291379996811355, "learning_rate": 4.2182030338389736e-05, "loss": 0.4621, "step": 459 }, { "epoch": 0.7244094488188977, "grad_norm": 0.44736955428510233, "learning_rate": 4.215285880980164e-05, "loss": 0.4394, "step": 460 }, { "epoch": 0.7259842519685039, "grad_norm": 0.40538615767496494, "learning_rate": 4.212368728121354e-05, "loss": 0.4389, "step": 461 }, { "epoch": 0.7275590551181103, "grad_norm": 0.34559994398763266, "learning_rate": 4.209451575262544e-05, "loss": 0.4069, "step": 462 }, { "epoch": 0.7291338582677165, "grad_norm": 0.4000836390089101, "learning_rate": 4.206534422403734e-05, "loss": 0.4464, "step": 463 }, { "epoch": 0.7307086614173228, "grad_norm": 0.3830422102185416, "learning_rate": 4.2036172695449245e-05, "loss": 0.4397, "step": 464 }, { "epoch": 0.7322834645669292, "grad_norm": 0.5426628698377676, "learning_rate": 4.200700116686114e-05, "loss": 0.4481, "step": 465 }, { "epoch": 0.7338582677165354, "grad_norm": 0.4384209645657967, "learning_rate": 4.197782963827305e-05, "loss": 0.4707, "step": 466 }, { "epoch": 0.7354330708661417, "grad_norm": 0.3947082835985258, "learning_rate": 4.194865810968495e-05, "loss": 0.4455, "step": 467 }, { "epoch": 0.7370078740157481, "grad_norm": 0.44390793220935126, "learning_rate": 4.191948658109685e-05, "loss": 0.446, "step": 468 }, { "epoch": 0.7385826771653543, "grad_norm": 0.4192565181290371, "learning_rate": 4.1890315052508754e-05, "loss": 0.4395, "step": 469 }, { "epoch": 0.7401574803149606, "grad_norm": 0.38217422345844526, "learning_rate": 4.186114352392065e-05, "loss": 0.4618, "step": 470 }, { "epoch": 0.7417322834645669, "grad_norm": 0.39734314982492325, "learning_rate": 4.1831971995332556e-05, "loss": 0.4384, "step": 471 }, { "epoch": 0.7433070866141732, "grad_norm": 0.39354182415686145, "learning_rate": 4.180280046674446e-05, "loss": 0.4583, "step": 472 }, { "epoch": 0.7448818897637796, "grad_norm": 0.3593658341589065, "learning_rate": 4.1773628938156365e-05, "loss": 0.4348, "step": 473 }, { "epoch": 0.7464566929133858, "grad_norm": 0.3773967273777185, "learning_rate": 4.174445740956827e-05, "loss": 0.4666, "step": 474 }, { "epoch": 0.7480314960629921, "grad_norm": 0.42579274946006185, "learning_rate": 4.171528588098017e-05, "loss": 0.4402, "step": 475 }, { "epoch": 0.7496062992125985, "grad_norm": 0.33035039261692634, "learning_rate": 4.168611435239207e-05, "loss": 0.4156, "step": 476 }, { "epoch": 0.7511811023622047, "grad_norm": 0.4041537658821666, "learning_rate": 4.165694282380397e-05, "loss": 0.442, "step": 477 }, { "epoch": 0.752755905511811, "grad_norm": 0.381663423423362, "learning_rate": 4.1627771295215873e-05, "loss": 0.434, "step": 478 }, { "epoch": 0.7543307086614173, "grad_norm": 0.32554864874090805, "learning_rate": 4.159859976662777e-05, "loss": 0.4222, "step": 479 }, { "epoch": 0.7559055118110236, "grad_norm": 0.351391709543344, "learning_rate": 4.1569428238039676e-05, "loss": 0.4342, "step": 480 }, { "epoch": 0.75748031496063, "grad_norm": 0.37636132163809755, "learning_rate": 4.154025670945157e-05, "loss": 0.4461, "step": 481 }, { "epoch": 0.7590551181102362, "grad_norm": 0.418680578449023, "learning_rate": 4.151108518086348e-05, "loss": 0.4437, "step": 482 }, { "epoch": 0.7606299212598425, "grad_norm": 0.5336068689096665, "learning_rate": 4.148191365227538e-05, "loss": 0.4309, "step": 483 }, { "epoch": 0.7622047244094489, "grad_norm": 0.35799467327393564, "learning_rate": 4.145274212368728e-05, "loss": 0.4645, "step": 484 }, { "epoch": 0.7637795275590551, "grad_norm": 0.48149472902582574, "learning_rate": 4.1423570595099184e-05, "loss": 0.4542, "step": 485 }, { "epoch": 0.7653543307086614, "grad_norm": 0.3676937600426536, "learning_rate": 4.139439906651109e-05, "loss": 0.4367, "step": 486 }, { "epoch": 0.7669291338582677, "grad_norm": 0.4339634767516138, "learning_rate": 4.136522753792299e-05, "loss": 0.4319, "step": 487 }, { "epoch": 0.768503937007874, "grad_norm": 0.3960040349621235, "learning_rate": 4.133605600933489e-05, "loss": 0.4206, "step": 488 }, { "epoch": 0.7700787401574803, "grad_norm": 0.46509393900228596, "learning_rate": 4.1306884480746796e-05, "loss": 0.4355, "step": 489 }, { "epoch": 0.7716535433070866, "grad_norm": 0.5266373330898565, "learning_rate": 4.127771295215869e-05, "loss": 0.4568, "step": 490 }, { "epoch": 0.7732283464566929, "grad_norm": 0.5528102738379786, "learning_rate": 4.12485414235706e-05, "loss": 0.429, "step": 491 }, { "epoch": 0.7748031496062993, "grad_norm": 0.45957668694276876, "learning_rate": 4.12193698949825e-05, "loss": 0.4444, "step": 492 }, { "epoch": 0.7763779527559055, "grad_norm": 0.5783879990368149, "learning_rate": 4.11901983663944e-05, "loss": 0.4424, "step": 493 }, { "epoch": 0.7779527559055118, "grad_norm": 0.42502555008340864, "learning_rate": 4.1161026837806304e-05, "loss": 0.431, "step": 494 }, { "epoch": 0.7795275590551181, "grad_norm": 0.47775659251686964, "learning_rate": 4.11318553092182e-05, "loss": 0.4403, "step": 495 }, { "epoch": 0.7811023622047244, "grad_norm": 0.5309179789376768, "learning_rate": 4.1102683780630107e-05, "loss": 0.439, "step": 496 }, { "epoch": 0.7826771653543307, "grad_norm": 0.43325120902737974, "learning_rate": 4.1073512252042004e-05, "loss": 0.422, "step": 497 }, { "epoch": 0.784251968503937, "grad_norm": 0.5532804248062936, "learning_rate": 4.104434072345391e-05, "loss": 0.4287, "step": 498 }, { "epoch": 0.7858267716535433, "grad_norm": 0.38669773432188537, "learning_rate": 4.101516919486581e-05, "loss": 0.4499, "step": 499 }, { "epoch": 0.7874015748031497, "grad_norm": 0.5475483456393899, "learning_rate": 4.098599766627772e-05, "loss": 0.4338, "step": 500 }, { "epoch": 0.7889763779527559, "grad_norm": 0.41099239698132234, "learning_rate": 4.095682613768962e-05, "loss": 0.4137, "step": 501 }, { "epoch": 0.7905511811023622, "grad_norm": 0.6057421327376746, "learning_rate": 4.092765460910152e-05, "loss": 0.4429, "step": 502 }, { "epoch": 0.7921259842519685, "grad_norm": 0.41232550042553406, "learning_rate": 4.0898483080513424e-05, "loss": 0.4517, "step": 503 }, { "epoch": 0.7937007874015748, "grad_norm": 0.6048141401985686, "learning_rate": 4.086931155192532e-05, "loss": 0.4364, "step": 504 }, { "epoch": 0.7952755905511811, "grad_norm": 0.5004002769845768, "learning_rate": 4.0840140023337226e-05, "loss": 0.4444, "step": 505 }, { "epoch": 0.7968503937007874, "grad_norm": 0.44944328940629635, "learning_rate": 4.0810968494749124e-05, "loss": 0.424, "step": 506 }, { "epoch": 0.7984251968503937, "grad_norm": 0.5059301931000612, "learning_rate": 4.078179696616103e-05, "loss": 0.4359, "step": 507 }, { "epoch": 0.8, "grad_norm": 0.36620246222188785, "learning_rate": 4.075262543757293e-05, "loss": 0.427, "step": 508 }, { "epoch": 0.8015748031496063, "grad_norm": 0.521160460111598, "learning_rate": 4.072345390898483e-05, "loss": 0.4423, "step": 509 }, { "epoch": 0.8031496062992126, "grad_norm": 0.3852233709618666, "learning_rate": 4.0694282380396735e-05, "loss": 0.4176, "step": 510 }, { "epoch": 0.8047244094488188, "grad_norm": 0.4758281016268729, "learning_rate": 4.066511085180863e-05, "loss": 0.4303, "step": 511 }, { "epoch": 0.8062992125984252, "grad_norm": 0.33594124301977196, "learning_rate": 4.063593932322054e-05, "loss": 0.4317, "step": 512 }, { "epoch": 0.8078740157480315, "grad_norm": 0.41693602842828004, "learning_rate": 4.060676779463244e-05, "loss": 0.4219, "step": 513 }, { "epoch": 0.8094488188976378, "grad_norm": 0.3377374304001947, "learning_rate": 4.0577596266044346e-05, "loss": 0.4269, "step": 514 }, { "epoch": 0.8110236220472441, "grad_norm": 0.4489294034178158, "learning_rate": 4.0548424737456244e-05, "loss": 0.4209, "step": 515 }, { "epoch": 0.8125984251968504, "grad_norm": 0.4448564981672704, "learning_rate": 4.051925320886815e-05, "loss": 0.4439, "step": 516 }, { "epoch": 0.8141732283464567, "grad_norm": 0.3596130847364335, "learning_rate": 4.049008168028005e-05, "loss": 0.4427, "step": 517 }, { "epoch": 0.815748031496063, "grad_norm": 0.3707689094003974, "learning_rate": 4.046091015169195e-05, "loss": 0.4303, "step": 518 }, { "epoch": 0.8173228346456692, "grad_norm": 0.4444875910750691, "learning_rate": 4.0431738623103855e-05, "loss": 0.448, "step": 519 }, { "epoch": 0.8188976377952756, "grad_norm": 0.4551026842961804, "learning_rate": 4.040256709451575e-05, "loss": 0.4319, "step": 520 }, { "epoch": 0.8204724409448819, "grad_norm": 0.4257027282201018, "learning_rate": 4.037339556592766e-05, "loss": 0.4482, "step": 521 }, { "epoch": 0.8220472440944881, "grad_norm": 0.5612042394612708, "learning_rate": 4.0344224037339555e-05, "loss": 0.4412, "step": 522 }, { "epoch": 0.8236220472440945, "grad_norm": 0.43969258412232204, "learning_rate": 4.031505250875146e-05, "loss": 0.4278, "step": 523 }, { "epoch": 0.8251968503937008, "grad_norm": 0.3388141568938749, "learning_rate": 4.0285880980163364e-05, "loss": 0.4267, "step": 524 }, { "epoch": 0.8267716535433071, "grad_norm": 0.5670207037716488, "learning_rate": 4.025670945157526e-05, "loss": 0.4242, "step": 525 }, { "epoch": 0.8283464566929134, "grad_norm": 0.31789064430652975, "learning_rate": 4.0227537922987166e-05, "loss": 0.4405, "step": 526 }, { "epoch": 0.8299212598425196, "grad_norm": 0.43062321841434104, "learning_rate": 4.019836639439907e-05, "loss": 0.4379, "step": 527 }, { "epoch": 0.831496062992126, "grad_norm": 0.346503899054319, "learning_rate": 4.0169194865810975e-05, "loss": 0.4216, "step": 528 }, { "epoch": 0.8330708661417323, "grad_norm": 0.4174123244237228, "learning_rate": 4.014002333722287e-05, "loss": 0.4472, "step": 529 }, { "epoch": 0.8346456692913385, "grad_norm": 0.34888670379159425, "learning_rate": 4.011085180863478e-05, "loss": 0.4385, "step": 530 }, { "epoch": 0.8362204724409449, "grad_norm": 0.4444704489937824, "learning_rate": 4.0081680280046675e-05, "loss": 0.4304, "step": 531 }, { "epoch": 0.8377952755905512, "grad_norm": 0.35827641961952994, "learning_rate": 4.005250875145858e-05, "loss": 0.4362, "step": 532 }, { "epoch": 0.8393700787401575, "grad_norm": 0.3264453941897347, "learning_rate": 4.0023337222870484e-05, "loss": 0.447, "step": 533 }, { "epoch": 0.8409448818897638, "grad_norm": 0.34207281247697074, "learning_rate": 3.999416569428238e-05, "loss": 0.4213, "step": 534 }, { "epoch": 0.84251968503937, "grad_norm": 0.36075212887147534, "learning_rate": 3.9964994165694286e-05, "loss": 0.4079, "step": 535 }, { "epoch": 0.8440944881889764, "grad_norm": 0.325046016400371, "learning_rate": 3.9935822637106184e-05, "loss": 0.4315, "step": 536 }, { "epoch": 0.8456692913385827, "grad_norm": 0.34172891637244207, "learning_rate": 3.990665110851809e-05, "loss": 0.418, "step": 537 }, { "epoch": 0.8472440944881889, "grad_norm": 0.3698008878443075, "learning_rate": 3.9877479579929986e-05, "loss": 0.439, "step": 538 }, { "epoch": 0.8488188976377953, "grad_norm": 0.34002077095535793, "learning_rate": 3.984830805134189e-05, "loss": 0.4301, "step": 539 }, { "epoch": 0.8503937007874016, "grad_norm": 0.38718841854182445, "learning_rate": 3.9819136522753795e-05, "loss": 0.423, "step": 540 }, { "epoch": 0.8519685039370078, "grad_norm": 0.34918205228975235, "learning_rate": 3.97899649941657e-05, "loss": 0.4347, "step": 541 }, { "epoch": 0.8535433070866142, "grad_norm": 0.49182979219117356, "learning_rate": 3.9760793465577604e-05, "loss": 0.4386, "step": 542 }, { "epoch": 0.8551181102362204, "grad_norm": 0.4324668695250364, "learning_rate": 3.97316219369895e-05, "loss": 0.414, "step": 543 }, { "epoch": 0.8566929133858268, "grad_norm": 0.4059665598729686, "learning_rate": 3.9702450408401406e-05, "loss": 0.4317, "step": 544 }, { "epoch": 0.8582677165354331, "grad_norm": 0.4648408980245247, "learning_rate": 3.9673278879813304e-05, "loss": 0.423, "step": 545 }, { "epoch": 0.8598425196850393, "grad_norm": 0.4306675737481622, "learning_rate": 3.964410735122521e-05, "loss": 0.4169, "step": 546 }, { "epoch": 0.8614173228346457, "grad_norm": 0.38120498206556686, "learning_rate": 3.9614935822637106e-05, "loss": 0.4372, "step": 547 }, { "epoch": 0.862992125984252, "grad_norm": 0.4378249707343946, "learning_rate": 3.958576429404901e-05, "loss": 0.4337, "step": 548 }, { "epoch": 0.8645669291338582, "grad_norm": 0.42125999640370415, "learning_rate": 3.9556592765460915e-05, "loss": 0.4154, "step": 549 }, { "epoch": 0.8661417322834646, "grad_norm": 0.43697160530402696, "learning_rate": 3.952742123687281e-05, "loss": 0.411, "step": 550 }, { "epoch": 0.8677165354330708, "grad_norm": 0.5072578612605366, "learning_rate": 3.949824970828472e-05, "loss": 0.4282, "step": 551 }, { "epoch": 0.8692913385826772, "grad_norm": 0.32394937038533583, "learning_rate": 3.9469078179696614e-05, "loss": 0.4317, "step": 552 }, { "epoch": 0.8708661417322835, "grad_norm": 0.4545452599889171, "learning_rate": 3.943990665110852e-05, "loss": 0.4308, "step": 553 }, { "epoch": 0.8724409448818897, "grad_norm": 0.36867105149902496, "learning_rate": 3.9410735122520423e-05, "loss": 0.4201, "step": 554 }, { "epoch": 0.8740157480314961, "grad_norm": 0.32916722213060984, "learning_rate": 3.938156359393233e-05, "loss": 0.4373, "step": 555 }, { "epoch": 0.8755905511811024, "grad_norm": 0.32802359950754617, "learning_rate": 3.9352392065344226e-05, "loss": 0.4269, "step": 556 }, { "epoch": 0.8771653543307086, "grad_norm": 0.3483666407386192, "learning_rate": 3.932322053675613e-05, "loss": 0.4335, "step": 557 }, { "epoch": 0.878740157480315, "grad_norm": 0.29698128898982934, "learning_rate": 3.9294049008168035e-05, "loss": 0.439, "step": 558 }, { "epoch": 0.8803149606299212, "grad_norm": 0.34742960069725093, "learning_rate": 3.926487747957993e-05, "loss": 0.4138, "step": 559 }, { "epoch": 0.8818897637795275, "grad_norm": 0.3363712586846115, "learning_rate": 3.923570595099184e-05, "loss": 0.4296, "step": 560 }, { "epoch": 0.8834645669291339, "grad_norm": 0.35589884528031246, "learning_rate": 3.9206534422403734e-05, "loss": 0.4154, "step": 561 }, { "epoch": 0.8850393700787401, "grad_norm": 0.32198523556202474, "learning_rate": 3.917736289381564e-05, "loss": 0.431, "step": 562 }, { "epoch": 0.8866141732283465, "grad_norm": 0.315822032274971, "learning_rate": 3.9148191365227537e-05, "loss": 0.4116, "step": 563 }, { "epoch": 0.8881889763779528, "grad_norm": 0.32361252790412454, "learning_rate": 3.911901983663944e-05, "loss": 0.4282, "step": 564 }, { "epoch": 0.889763779527559, "grad_norm": 0.31778937494832177, "learning_rate": 3.908984830805134e-05, "loss": 0.4414, "step": 565 }, { "epoch": 0.8913385826771654, "grad_norm": 0.3330859573786503, "learning_rate": 3.906067677946324e-05, "loss": 0.4421, "step": 566 }, { "epoch": 0.8929133858267716, "grad_norm": 0.328998050447561, "learning_rate": 3.903150525087515e-05, "loss": 0.4469, "step": 567 }, { "epoch": 0.8944881889763779, "grad_norm": 0.3690094871053156, "learning_rate": 3.900233372228705e-05, "loss": 0.4269, "step": 568 }, { "epoch": 0.8960629921259843, "grad_norm": 0.3721067159467623, "learning_rate": 3.8973162193698957e-05, "loss": 0.4346, "step": 569 }, { "epoch": 0.8976377952755905, "grad_norm": 0.3929673235299099, "learning_rate": 3.8943990665110854e-05, "loss": 0.4348, "step": 570 }, { "epoch": 0.8992125984251969, "grad_norm": 0.4042219621748363, "learning_rate": 3.891481913652276e-05, "loss": 0.4515, "step": 571 }, { "epoch": 0.9007874015748032, "grad_norm": 0.33173907183380974, "learning_rate": 3.8885647607934656e-05, "loss": 0.4422, "step": 572 }, { "epoch": 0.9023622047244094, "grad_norm": 0.34141384186086854, "learning_rate": 3.885647607934656e-05, "loss": 0.4135, "step": 573 }, { "epoch": 0.9039370078740158, "grad_norm": 0.31779306572056143, "learning_rate": 3.8827304550758465e-05, "loss": 0.4234, "step": 574 }, { "epoch": 0.905511811023622, "grad_norm": 0.41486708545032225, "learning_rate": 3.879813302217036e-05, "loss": 0.4185, "step": 575 }, { "epoch": 0.9070866141732283, "grad_norm": 0.3495859759583976, "learning_rate": 3.876896149358227e-05, "loss": 0.4233, "step": 576 }, { "epoch": 0.9086614173228347, "grad_norm": 0.39260913117142976, "learning_rate": 3.8739789964994165e-05, "loss": 0.4222, "step": 577 }, { "epoch": 0.9102362204724409, "grad_norm": 0.3210826241145465, "learning_rate": 3.871061843640607e-05, "loss": 0.4022, "step": 578 }, { "epoch": 0.9118110236220472, "grad_norm": 0.39600301737223204, "learning_rate": 3.868144690781797e-05, "loss": 0.4309, "step": 579 }, { "epoch": 0.9133858267716536, "grad_norm": 0.36236793473263024, "learning_rate": 3.865227537922987e-05, "loss": 0.4326, "step": 580 }, { "epoch": 0.9149606299212598, "grad_norm": 0.4443531640646438, "learning_rate": 3.8623103850641776e-05, "loss": 0.4426, "step": 581 }, { "epoch": 0.9165354330708662, "grad_norm": 0.3633837334596617, "learning_rate": 3.859393232205368e-05, "loss": 0.418, "step": 582 }, { "epoch": 0.9181102362204724, "grad_norm": 0.3760170937543012, "learning_rate": 3.8564760793465585e-05, "loss": 0.4306, "step": 583 }, { "epoch": 0.9196850393700787, "grad_norm": 0.3830147206929489, "learning_rate": 3.853558926487748e-05, "loss": 0.4315, "step": 584 }, { "epoch": 0.9212598425196851, "grad_norm": 0.3925260150980857, "learning_rate": 3.850641773628939e-05, "loss": 0.4251, "step": 585 }, { "epoch": 0.9228346456692913, "grad_norm": 0.341802082315545, "learning_rate": 3.8477246207701285e-05, "loss": 0.4239, "step": 586 }, { "epoch": 0.9244094488188976, "grad_norm": 0.37356287815775857, "learning_rate": 3.844807467911319e-05, "loss": 0.4339, "step": 587 }, { "epoch": 0.925984251968504, "grad_norm": 0.33465495845802323, "learning_rate": 3.841890315052509e-05, "loss": 0.4105, "step": 588 }, { "epoch": 0.9275590551181102, "grad_norm": 0.3372017936637061, "learning_rate": 3.838973162193699e-05, "loss": 0.4058, "step": 589 }, { "epoch": 0.9291338582677166, "grad_norm": 0.36260222026038796, "learning_rate": 3.836056009334889e-05, "loss": 0.4227, "step": 590 }, { "epoch": 0.9307086614173228, "grad_norm": 0.36752057157351165, "learning_rate": 3.8331388564760794e-05, "loss": 0.4493, "step": 591 }, { "epoch": 0.9322834645669291, "grad_norm": 0.3544686159669946, "learning_rate": 3.83022170361727e-05, "loss": 0.4183, "step": 592 }, { "epoch": 0.9338582677165355, "grad_norm": 0.44743234002659926, "learning_rate": 3.8273045507584596e-05, "loss": 0.4408, "step": 593 }, { "epoch": 0.9354330708661417, "grad_norm": 0.4163687698038925, "learning_rate": 3.82438739789965e-05, "loss": 0.443, "step": 594 }, { "epoch": 0.937007874015748, "grad_norm": 0.4106942486410524, "learning_rate": 3.8214702450408405e-05, "loss": 0.4371, "step": 595 }, { "epoch": 0.9385826771653544, "grad_norm": 0.3891851204195998, "learning_rate": 3.818553092182031e-05, "loss": 0.4253, "step": 596 }, { "epoch": 0.9401574803149606, "grad_norm": 0.34471373714778775, "learning_rate": 3.815635939323221e-05, "loss": 0.423, "step": 597 }, { "epoch": 0.9417322834645669, "grad_norm": 0.4378838600849905, "learning_rate": 3.812718786464411e-05, "loss": 0.4369, "step": 598 }, { "epoch": 0.9433070866141732, "grad_norm": 0.30501660868329944, "learning_rate": 3.8098016336056016e-05, "loss": 0.4144, "step": 599 }, { "epoch": 0.9448818897637795, "grad_norm": 0.4337597115373803, "learning_rate": 3.8068844807467914e-05, "loss": 0.4312, "step": 600 }, { "epoch": 0.9464566929133859, "grad_norm": 0.37099537909656466, "learning_rate": 3.803967327887982e-05, "loss": 0.4548, "step": 601 }, { "epoch": 0.9480314960629921, "grad_norm": 0.40217520843641935, "learning_rate": 3.8010501750291716e-05, "loss": 0.4173, "step": 602 }, { "epoch": 0.9496062992125984, "grad_norm": 0.39169589586861214, "learning_rate": 3.798133022170362e-05, "loss": 0.4347, "step": 603 }, { "epoch": 0.9511811023622048, "grad_norm": 0.29848323499106016, "learning_rate": 3.795215869311552e-05, "loss": 0.4147, "step": 604 }, { "epoch": 0.952755905511811, "grad_norm": 0.46498656000113325, "learning_rate": 3.792298716452742e-05, "loss": 0.4272, "step": 605 }, { "epoch": 0.9543307086614173, "grad_norm": 0.3273577561351757, "learning_rate": 3.789381563593932e-05, "loss": 0.42, "step": 606 }, { "epoch": 0.9559055118110236, "grad_norm": 0.3377254258732802, "learning_rate": 3.7864644107351225e-05, "loss": 0.4299, "step": 607 }, { "epoch": 0.9574803149606299, "grad_norm": 0.417032858985177, "learning_rate": 3.783547257876313e-05, "loss": 0.4312, "step": 608 }, { "epoch": 0.9590551181102362, "grad_norm": 0.34470705122522627, "learning_rate": 3.7806301050175034e-05, "loss": 0.4227, "step": 609 }, { "epoch": 0.9606299212598425, "grad_norm": 0.41349664969077404, "learning_rate": 3.777712952158693e-05, "loss": 0.4434, "step": 610 }, { "epoch": 0.9622047244094488, "grad_norm": 0.3575436540857483, "learning_rate": 3.7747957992998836e-05, "loss": 0.4101, "step": 611 }, { "epoch": 0.9637795275590552, "grad_norm": 0.3638658438333242, "learning_rate": 3.771878646441074e-05, "loss": 0.4448, "step": 612 }, { "epoch": 0.9653543307086614, "grad_norm": 0.4383235092959359, "learning_rate": 3.768961493582264e-05, "loss": 0.4288, "step": 613 }, { "epoch": 0.9669291338582677, "grad_norm": 0.3282728462099762, "learning_rate": 3.766044340723454e-05, "loss": 0.449, "step": 614 }, { "epoch": 0.968503937007874, "grad_norm": 0.41326122108454866, "learning_rate": 3.763127187864644e-05, "loss": 0.4567, "step": 615 }, { "epoch": 0.9700787401574803, "grad_norm": 0.3987383502145384, "learning_rate": 3.7602100350058345e-05, "loss": 0.4215, "step": 616 }, { "epoch": 0.9716535433070866, "grad_norm": 0.32382070292989457, "learning_rate": 3.757292882147025e-05, "loss": 0.43, "step": 617 }, { "epoch": 0.9732283464566929, "grad_norm": 0.44643278688341204, "learning_rate": 3.754375729288215e-05, "loss": 0.4432, "step": 618 }, { "epoch": 0.9748031496062992, "grad_norm": 0.31901358025833415, "learning_rate": 3.751458576429405e-05, "loss": 0.413, "step": 619 }, { "epoch": 0.9763779527559056, "grad_norm": 0.3671334550313984, "learning_rate": 3.748541423570595e-05, "loss": 0.4245, "step": 620 }, { "epoch": 0.9779527559055118, "grad_norm": 0.41579026693059556, "learning_rate": 3.7456242707117853e-05, "loss": 0.4268, "step": 621 }, { "epoch": 0.9795275590551181, "grad_norm": 0.3452699126156607, "learning_rate": 3.742707117852976e-05, "loss": 0.4388, "step": 622 }, { "epoch": 0.9811023622047244, "grad_norm": 0.3109636975657417, "learning_rate": 3.7397899649941656e-05, "loss": 0.405, "step": 623 }, { "epoch": 0.9826771653543307, "grad_norm": 0.3434730583991853, "learning_rate": 3.736872812135356e-05, "loss": 0.4252, "step": 624 }, { "epoch": 0.984251968503937, "grad_norm": 0.35222828393733996, "learning_rate": 3.7339556592765465e-05, "loss": 0.4344, "step": 625 }, { "epoch": 0.9858267716535433, "grad_norm": 0.38205340598068144, "learning_rate": 3.731038506417737e-05, "loss": 0.4277, "step": 626 }, { "epoch": 0.9874015748031496, "grad_norm": 0.33170307703427054, "learning_rate": 3.728121353558927e-05, "loss": 0.4304, "step": 627 }, { "epoch": 0.988976377952756, "grad_norm": 0.35922038766319464, "learning_rate": 3.725204200700117e-05, "loss": 0.4221, "step": 628 }, { "epoch": 0.9905511811023622, "grad_norm": 0.3757740673222827, "learning_rate": 3.722287047841307e-05, "loss": 0.4295, "step": 629 }, { "epoch": 0.9921259842519685, "grad_norm": 0.38341267811209323, "learning_rate": 3.719369894982497e-05, "loss": 0.4355, "step": 630 }, { "epoch": 0.9937007874015747, "grad_norm": 0.3981342414281751, "learning_rate": 3.716452742123687e-05, "loss": 0.41, "step": 631 }, { "epoch": 0.9952755905511811, "grad_norm": 0.3879070637921747, "learning_rate": 3.7135355892648775e-05, "loss": 0.4146, "step": 632 }, { "epoch": 0.9968503937007874, "grad_norm": 0.4454450487433733, "learning_rate": 3.710618436406068e-05, "loss": 0.4113, "step": 633 }, { "epoch": 0.9984251968503937, "grad_norm": 0.3463441206306324, "learning_rate": 3.707701283547258e-05, "loss": 0.4094, "step": 634 }, { "epoch": 1.0, "grad_norm": 0.47048384573369845, "learning_rate": 3.704784130688448e-05, "loss": 0.408, "step": 635 }, { "epoch": 1.0015748031496063, "grad_norm": 0.389747093095725, "learning_rate": 3.701866977829638e-05, "loss": 0.3651, "step": 636 }, { "epoch": 1.0031496062992127, "grad_norm": 0.37154024040972594, "learning_rate": 3.6989498249708284e-05, "loss": 0.3522, "step": 637 }, { "epoch": 1.0047244094488188, "grad_norm": 0.33966147908023636, "learning_rate": 3.696032672112019e-05, "loss": 0.3639, "step": 638 }, { "epoch": 1.0062992125984251, "grad_norm": 0.4371396829216366, "learning_rate": 3.693115519253209e-05, "loss": 0.394, "step": 639 }, { "epoch": 1.0078740157480315, "grad_norm": 0.3392511391992037, "learning_rate": 3.690198366394399e-05, "loss": 0.3756, "step": 640 }, { "epoch": 1.0094488188976378, "grad_norm": 0.39497327840531526, "learning_rate": 3.6872812135355895e-05, "loss": 0.3585, "step": 641 }, { "epoch": 1.0110236220472442, "grad_norm": 0.3463579168758424, "learning_rate": 3.68436406067678e-05, "loss": 0.3512, "step": 642 }, { "epoch": 1.0125984251968503, "grad_norm": 0.40498929656601707, "learning_rate": 3.68144690781797e-05, "loss": 0.3602, "step": 643 }, { "epoch": 1.0141732283464566, "grad_norm": 0.3579742126282003, "learning_rate": 3.67852975495916e-05, "loss": 0.3496, "step": 644 }, { "epoch": 1.015748031496063, "grad_norm": 0.42857447678645183, "learning_rate": 3.67561260210035e-05, "loss": 0.3661, "step": 645 }, { "epoch": 1.0173228346456693, "grad_norm": 0.34595131991807954, "learning_rate": 3.6726954492415404e-05, "loss": 0.376, "step": 646 }, { "epoch": 1.0188976377952756, "grad_norm": 0.43270887387606954, "learning_rate": 3.66977829638273e-05, "loss": 0.358, "step": 647 }, { "epoch": 1.020472440944882, "grad_norm": 0.3876109212206472, "learning_rate": 3.6668611435239206e-05, "loss": 0.3547, "step": 648 }, { "epoch": 1.022047244094488, "grad_norm": 0.3659631612865057, "learning_rate": 3.663943990665111e-05, "loss": 0.3673, "step": 649 }, { "epoch": 1.0236220472440944, "grad_norm": 0.35072586223547786, "learning_rate": 3.661026837806301e-05, "loss": 0.3654, "step": 650 }, { "epoch": 1.0251968503937008, "grad_norm": 0.30324839202925785, "learning_rate": 3.658109684947491e-05, "loss": 0.3429, "step": 651 }, { "epoch": 1.0267716535433071, "grad_norm": 0.3709722608873786, "learning_rate": 3.655192532088682e-05, "loss": 0.3598, "step": 652 }, { "epoch": 1.0283464566929135, "grad_norm": 0.3624274789162988, "learning_rate": 3.652275379229872e-05, "loss": 0.378, "step": 653 }, { "epoch": 1.0299212598425196, "grad_norm": 0.37137550382261514, "learning_rate": 3.649358226371062e-05, "loss": 0.3693, "step": 654 }, { "epoch": 1.031496062992126, "grad_norm": 0.3565434319815368, "learning_rate": 3.6464410735122524e-05, "loss": 0.3672, "step": 655 }, { "epoch": 1.0330708661417323, "grad_norm": 0.38084814251560084, "learning_rate": 3.643523920653442e-05, "loss": 0.3524, "step": 656 }, { "epoch": 1.0346456692913386, "grad_norm": 0.32747044241638795, "learning_rate": 3.6406067677946326e-05, "loss": 0.3544, "step": 657 }, { "epoch": 1.036220472440945, "grad_norm": 0.37216824980984775, "learning_rate": 3.637689614935823e-05, "loss": 0.3584, "step": 658 }, { "epoch": 1.0377952755905513, "grad_norm": 0.34076095101438875, "learning_rate": 3.634772462077013e-05, "loss": 0.3695, "step": 659 }, { "epoch": 1.0393700787401574, "grad_norm": 0.33071335116138806, "learning_rate": 3.631855309218203e-05, "loss": 0.3519, "step": 660 }, { "epoch": 1.0409448818897638, "grad_norm": 0.3469334740009156, "learning_rate": 3.628938156359393e-05, "loss": 0.3637, "step": 661 }, { "epoch": 1.04251968503937, "grad_norm": 0.28934927250702003, "learning_rate": 3.6260210035005835e-05, "loss": 0.3459, "step": 662 }, { "epoch": 1.0440944881889764, "grad_norm": 0.3149200746627072, "learning_rate": 3.623103850641773e-05, "loss": 0.3827, "step": 663 }, { "epoch": 1.0456692913385828, "grad_norm": 0.3252817299491694, "learning_rate": 3.620186697782964e-05, "loss": 0.3712, "step": 664 }, { "epoch": 1.047244094488189, "grad_norm": 0.2951437283218739, "learning_rate": 3.617269544924154e-05, "loss": 0.355, "step": 665 }, { "epoch": 1.0488188976377952, "grad_norm": 0.31194633347634665, "learning_rate": 3.6143523920653446e-05, "loss": 0.3424, "step": 666 }, { "epoch": 1.0503937007874016, "grad_norm": 0.29277855769160616, "learning_rate": 3.611435239206535e-05, "loss": 0.3867, "step": 667 }, { "epoch": 1.051968503937008, "grad_norm": 0.3580132334538665, "learning_rate": 3.608518086347725e-05, "loss": 0.3568, "step": 668 }, { "epoch": 1.0535433070866143, "grad_norm": 0.29902104391204387, "learning_rate": 3.605600933488915e-05, "loss": 0.3586, "step": 669 }, { "epoch": 1.0551181102362204, "grad_norm": 0.3002086113085408, "learning_rate": 3.602683780630105e-05, "loss": 0.3519, "step": 670 }, { "epoch": 1.0566929133858267, "grad_norm": 0.299046311074371, "learning_rate": 3.5997666277712955e-05, "loss": 0.3406, "step": 671 }, { "epoch": 1.058267716535433, "grad_norm": 0.2746441366562475, "learning_rate": 3.596849474912485e-05, "loss": 0.3555, "step": 672 }, { "epoch": 1.0598425196850394, "grad_norm": 0.30020714518087765, "learning_rate": 3.593932322053676e-05, "loss": 0.35, "step": 673 }, { "epoch": 1.0614173228346457, "grad_norm": 0.3312377354102601, "learning_rate": 3.591015169194866e-05, "loss": 0.3887, "step": 674 }, { "epoch": 1.0629921259842519, "grad_norm": 0.28717205978701144, "learning_rate": 3.588098016336056e-05, "loss": 0.3731, "step": 675 }, { "epoch": 1.0645669291338582, "grad_norm": 0.3376924099372287, "learning_rate": 3.5851808634772464e-05, "loss": 0.3679, "step": 676 }, { "epoch": 1.0661417322834645, "grad_norm": 0.390661004081781, "learning_rate": 3.582263710618436e-05, "loss": 0.3785, "step": 677 }, { "epoch": 1.0677165354330709, "grad_norm": 0.280791696308288, "learning_rate": 3.5793465577596266e-05, "loss": 0.344, "step": 678 }, { "epoch": 1.0692913385826772, "grad_norm": 0.35917813657603737, "learning_rate": 3.576429404900817e-05, "loss": 0.3675, "step": 679 }, { "epoch": 1.0708661417322836, "grad_norm": 0.3687094250150882, "learning_rate": 3.5735122520420075e-05, "loss": 0.365, "step": 680 }, { "epoch": 1.0724409448818897, "grad_norm": 0.43039597657460477, "learning_rate": 3.570595099183197e-05, "loss": 0.3782, "step": 681 }, { "epoch": 1.074015748031496, "grad_norm": 0.36691173470964694, "learning_rate": 3.567677946324388e-05, "loss": 0.3568, "step": 682 }, { "epoch": 1.0755905511811024, "grad_norm": 0.5491582493396894, "learning_rate": 3.564760793465578e-05, "loss": 0.3575, "step": 683 }, { "epoch": 1.0771653543307087, "grad_norm": 0.29655999820999185, "learning_rate": 3.561843640606768e-05, "loss": 0.3511, "step": 684 }, { "epoch": 1.078740157480315, "grad_norm": 0.5273713876586933, "learning_rate": 3.5589264877479584e-05, "loss": 0.3791, "step": 685 }, { "epoch": 1.0803149606299212, "grad_norm": 0.3759316320284721, "learning_rate": 3.556009334889148e-05, "loss": 0.348, "step": 686 }, { "epoch": 1.0818897637795275, "grad_norm": 0.42614548884709197, "learning_rate": 3.5530921820303386e-05, "loss": 0.3728, "step": 687 }, { "epoch": 1.0834645669291338, "grad_norm": 0.43018934877674964, "learning_rate": 3.5501750291715283e-05, "loss": 0.3721, "step": 688 }, { "epoch": 1.0850393700787402, "grad_norm": 0.40954767389850255, "learning_rate": 3.547257876312719e-05, "loss": 0.3654, "step": 689 }, { "epoch": 1.0866141732283465, "grad_norm": 0.4922932678435167, "learning_rate": 3.5443407234539086e-05, "loss": 0.3757, "step": 690 }, { "epoch": 1.0881889763779529, "grad_norm": 0.3642726003806653, "learning_rate": 3.541423570595099e-05, "loss": 0.3521, "step": 691 }, { "epoch": 1.089763779527559, "grad_norm": 0.4137468612008206, "learning_rate": 3.5385064177362895e-05, "loss": 0.3605, "step": 692 }, { "epoch": 1.0913385826771653, "grad_norm": 0.42649471354965274, "learning_rate": 3.53558926487748e-05, "loss": 0.3642, "step": 693 }, { "epoch": 1.0929133858267717, "grad_norm": 0.32253937734270816, "learning_rate": 3.5326721120186703e-05, "loss": 0.3758, "step": 694 }, { "epoch": 1.094488188976378, "grad_norm": 0.4253552010177525, "learning_rate": 3.52975495915986e-05, "loss": 0.3562, "step": 695 }, { "epoch": 1.0960629921259843, "grad_norm": 0.3625997063893861, "learning_rate": 3.5268378063010506e-05, "loss": 0.3593, "step": 696 }, { "epoch": 1.0976377952755905, "grad_norm": 0.31185394009892914, "learning_rate": 3.52392065344224e-05, "loss": 0.3625, "step": 697 }, { "epoch": 1.0992125984251968, "grad_norm": 0.462764151727316, "learning_rate": 3.521003500583431e-05, "loss": 0.3983, "step": 698 }, { "epoch": 1.1007874015748031, "grad_norm": 0.3173077987619563, "learning_rate": 3.518086347724621e-05, "loss": 0.3639, "step": 699 }, { "epoch": 1.1023622047244095, "grad_norm": 0.3850260331602656, "learning_rate": 3.515169194865811e-05, "loss": 0.3437, "step": 700 }, { "epoch": 1.1039370078740158, "grad_norm": 0.4010254695451832, "learning_rate": 3.5122520420070014e-05, "loss": 0.3741, "step": 701 }, { "epoch": 1.105511811023622, "grad_norm": 0.3379905372557732, "learning_rate": 3.509334889148191e-05, "loss": 0.3497, "step": 702 }, { "epoch": 1.1070866141732283, "grad_norm": 0.38255514131056534, "learning_rate": 3.5064177362893817e-05, "loss": 0.3691, "step": 703 }, { "epoch": 1.1086614173228346, "grad_norm": 0.36106455471948967, "learning_rate": 3.5035005834305714e-05, "loss": 0.3689, "step": 704 }, { "epoch": 1.110236220472441, "grad_norm": 0.3358150288925636, "learning_rate": 3.500583430571762e-05, "loss": 0.3683, "step": 705 }, { "epoch": 1.1118110236220473, "grad_norm": 0.35037247342774674, "learning_rate": 3.497666277712952e-05, "loss": 0.3538, "step": 706 }, { "epoch": 1.1133858267716534, "grad_norm": 0.31522563391262437, "learning_rate": 3.494749124854143e-05, "loss": 0.3484, "step": 707 }, { "epoch": 1.1149606299212598, "grad_norm": 0.35645805851041823, "learning_rate": 3.491831971995333e-05, "loss": 0.3607, "step": 708 }, { "epoch": 1.1165354330708661, "grad_norm": 0.3099299581103233, "learning_rate": 3.488914819136523e-05, "loss": 0.3482, "step": 709 }, { "epoch": 1.1181102362204725, "grad_norm": 0.3583025185552557, "learning_rate": 3.4859976662777134e-05, "loss": 0.3792, "step": 710 }, { "epoch": 1.1196850393700788, "grad_norm": 0.32613669909261245, "learning_rate": 3.483080513418903e-05, "loss": 0.3629, "step": 711 }, { "epoch": 1.1212598425196851, "grad_norm": 0.4064806162538714, "learning_rate": 3.4801633605600936e-05, "loss": 0.3833, "step": 712 }, { "epoch": 1.1228346456692913, "grad_norm": 0.29294182391862245, "learning_rate": 3.4772462077012834e-05, "loss": 0.3357, "step": 713 }, { "epoch": 1.1244094488188976, "grad_norm": 0.41210314978236817, "learning_rate": 3.474329054842474e-05, "loss": 0.3773, "step": 714 }, { "epoch": 1.125984251968504, "grad_norm": 0.26480405922351, "learning_rate": 3.4714119019836636e-05, "loss": 0.3671, "step": 715 }, { "epoch": 1.1275590551181103, "grad_norm": 0.31576865720971103, "learning_rate": 3.468494749124854e-05, "loss": 0.3772, "step": 716 }, { "epoch": 1.1291338582677166, "grad_norm": 0.3324232866676517, "learning_rate": 3.4655775962660445e-05, "loss": 0.3572, "step": 717 }, { "epoch": 1.130708661417323, "grad_norm": 0.33467471200102655, "learning_rate": 3.462660443407234e-05, "loss": 0.3478, "step": 718 }, { "epoch": 1.132283464566929, "grad_norm": 0.33468212873740677, "learning_rate": 3.459743290548425e-05, "loss": 0.3625, "step": 719 }, { "epoch": 1.1338582677165354, "grad_norm": 0.3015345455407294, "learning_rate": 3.456826137689615e-05, "loss": 0.3518, "step": 720 }, { "epoch": 1.1354330708661418, "grad_norm": 0.3385865103785134, "learning_rate": 3.4539089848308056e-05, "loss": 0.3607, "step": 721 }, { "epoch": 1.137007874015748, "grad_norm": 0.2726001047957336, "learning_rate": 3.4509918319719954e-05, "loss": 0.3758, "step": 722 }, { "epoch": 1.1385826771653544, "grad_norm": 0.33330408483860613, "learning_rate": 3.448074679113186e-05, "loss": 0.3843, "step": 723 }, { "epoch": 1.1401574803149606, "grad_norm": 0.31545809064661634, "learning_rate": 3.445157526254376e-05, "loss": 0.3548, "step": 724 }, { "epoch": 1.141732283464567, "grad_norm": 0.33758341919272994, "learning_rate": 3.442240373395566e-05, "loss": 0.3595, "step": 725 }, { "epoch": 1.1433070866141732, "grad_norm": 0.34946364868562946, "learning_rate": 3.4393232205367565e-05, "loss": 0.3728, "step": 726 }, { "epoch": 1.1448818897637796, "grad_norm": 0.3005108895361123, "learning_rate": 3.436406067677946e-05, "loss": 0.3364, "step": 727 }, { "epoch": 1.146456692913386, "grad_norm": 0.39222258521845427, "learning_rate": 3.433488914819137e-05, "loss": 0.3486, "step": 728 }, { "epoch": 1.148031496062992, "grad_norm": 0.3537165303815825, "learning_rate": 3.4305717619603265e-05, "loss": 0.3675, "step": 729 }, { "epoch": 1.1496062992125984, "grad_norm": 0.3786865755761858, "learning_rate": 3.427654609101517e-05, "loss": 0.383, "step": 730 }, { "epoch": 1.1511811023622047, "grad_norm": 0.3275024177893699, "learning_rate": 3.424737456242707e-05, "loss": 0.3449, "step": 731 }, { "epoch": 1.152755905511811, "grad_norm": 0.3607181613092288, "learning_rate": 3.421820303383897e-05, "loss": 0.3695, "step": 732 }, { "epoch": 1.1543307086614174, "grad_norm": 0.35081567931522956, "learning_rate": 3.4189031505250876e-05, "loss": 0.3959, "step": 733 }, { "epoch": 1.1559055118110235, "grad_norm": 0.3419506851727794, "learning_rate": 3.415985997666278e-05, "loss": 0.3985, "step": 734 }, { "epoch": 1.1574803149606299, "grad_norm": 0.31946693749598154, "learning_rate": 3.4130688448074685e-05, "loss": 0.359, "step": 735 }, { "epoch": 1.1590551181102362, "grad_norm": 0.35119649633888994, "learning_rate": 3.410151691948658e-05, "loss": 0.3588, "step": 736 }, { "epoch": 1.1606299212598425, "grad_norm": 0.33604321964761324, "learning_rate": 3.407234539089849e-05, "loss": 0.3501, "step": 737 }, { "epoch": 1.1622047244094489, "grad_norm": 0.2920093903115403, "learning_rate": 3.4043173862310385e-05, "loss": 0.3539, "step": 738 }, { "epoch": 1.163779527559055, "grad_norm": 0.3193525823337497, "learning_rate": 3.401400233372229e-05, "loss": 0.3651, "step": 739 }, { "epoch": 1.1653543307086613, "grad_norm": 0.3356171497516235, "learning_rate": 3.398483080513419e-05, "loss": 0.3616, "step": 740 }, { "epoch": 1.1669291338582677, "grad_norm": 0.32010993699175844, "learning_rate": 3.395565927654609e-05, "loss": 0.3554, "step": 741 }, { "epoch": 1.168503937007874, "grad_norm": 0.3039327471675238, "learning_rate": 3.3926487747957996e-05, "loss": 0.3607, "step": 742 }, { "epoch": 1.1700787401574804, "grad_norm": 0.3522629755065416, "learning_rate": 3.3897316219369894e-05, "loss": 0.3727, "step": 743 }, { "epoch": 1.1716535433070867, "grad_norm": 0.2920245729881075, "learning_rate": 3.38681446907818e-05, "loss": 0.3802, "step": 744 }, { "epoch": 1.1732283464566928, "grad_norm": 0.2906171097208711, "learning_rate": 3.3838973162193696e-05, "loss": 0.3632, "step": 745 }, { "epoch": 1.1748031496062992, "grad_norm": 0.2895835393623222, "learning_rate": 3.38098016336056e-05, "loss": 0.3527, "step": 746 }, { "epoch": 1.1763779527559055, "grad_norm": 0.3048764771346379, "learning_rate": 3.3780630105017505e-05, "loss": 0.3627, "step": 747 }, { "epoch": 1.1779527559055119, "grad_norm": 0.31505882192181983, "learning_rate": 3.375145857642941e-05, "loss": 0.3442, "step": 748 }, { "epoch": 1.1795275590551182, "grad_norm": 0.27759645831699015, "learning_rate": 3.3722287047841314e-05, "loss": 0.3385, "step": 749 }, { "epoch": 1.1811023622047245, "grad_norm": 0.34397110910690365, "learning_rate": 3.369311551925321e-05, "loss": 0.3702, "step": 750 }, { "epoch": 1.1826771653543307, "grad_norm": 0.33213883925906806, "learning_rate": 3.3663943990665116e-05, "loss": 0.3622, "step": 751 }, { "epoch": 1.184251968503937, "grad_norm": 0.26610528259273386, "learning_rate": 3.3634772462077014e-05, "loss": 0.3648, "step": 752 }, { "epoch": 1.1858267716535433, "grad_norm": 0.38796634492893367, "learning_rate": 3.360560093348892e-05, "loss": 0.3566, "step": 753 }, { "epoch": 1.1874015748031497, "grad_norm": 0.3276719048885679, "learning_rate": 3.3576429404900816e-05, "loss": 0.3754, "step": 754 }, { "epoch": 1.188976377952756, "grad_norm": 0.2953406126812746, "learning_rate": 3.354725787631272e-05, "loss": 0.3402, "step": 755 }, { "epoch": 1.1905511811023621, "grad_norm": 0.28935381214407335, "learning_rate": 3.351808634772462e-05, "loss": 0.3524, "step": 756 }, { "epoch": 1.1921259842519685, "grad_norm": 0.34836962677148475, "learning_rate": 3.348891481913652e-05, "loss": 0.3672, "step": 757 }, { "epoch": 1.1937007874015748, "grad_norm": 0.3166215750755353, "learning_rate": 3.345974329054843e-05, "loss": 0.3824, "step": 758 }, { "epoch": 1.1952755905511812, "grad_norm": 0.35687442641820605, "learning_rate": 3.3430571761960325e-05, "loss": 0.3484, "step": 759 }, { "epoch": 1.1968503937007875, "grad_norm": 0.351941176281037, "learning_rate": 3.340140023337223e-05, "loss": 0.3771, "step": 760 }, { "epoch": 1.1984251968503936, "grad_norm": 0.3384343370240843, "learning_rate": 3.3372228704784133e-05, "loss": 0.3828, "step": 761 }, { "epoch": 1.2, "grad_norm": 0.34487633438849513, "learning_rate": 3.334305717619604e-05, "loss": 0.3654, "step": 762 }, { "epoch": 1.2015748031496063, "grad_norm": 0.37608929992030454, "learning_rate": 3.3313885647607936e-05, "loss": 0.3699, "step": 763 }, { "epoch": 1.2031496062992126, "grad_norm": 0.3265103722176317, "learning_rate": 3.328471411901984e-05, "loss": 0.3737, "step": 764 }, { "epoch": 1.204724409448819, "grad_norm": 0.3081326880147797, "learning_rate": 3.325554259043174e-05, "loss": 0.3773, "step": 765 }, { "epoch": 1.206299212598425, "grad_norm": 0.3336229977709828, "learning_rate": 3.322637106184364e-05, "loss": 0.3634, "step": 766 }, { "epoch": 1.2078740157480314, "grad_norm": 0.2807590318386093, "learning_rate": 3.319719953325555e-05, "loss": 0.352, "step": 767 }, { "epoch": 1.2094488188976378, "grad_norm": 0.29367270901218984, "learning_rate": 3.3168028004667444e-05, "loss": 0.3654, "step": 768 }, { "epoch": 1.2110236220472441, "grad_norm": 0.2875532270136273, "learning_rate": 3.313885647607935e-05, "loss": 0.3497, "step": 769 }, { "epoch": 1.2125984251968505, "grad_norm": 0.30598197018243345, "learning_rate": 3.3109684947491247e-05, "loss": 0.3478, "step": 770 }, { "epoch": 1.2141732283464566, "grad_norm": 0.29563272816067243, "learning_rate": 3.308051341890315e-05, "loss": 0.3726, "step": 771 }, { "epoch": 1.215748031496063, "grad_norm": 0.2975103465348826, "learning_rate": 3.305134189031505e-05, "loss": 0.3532, "step": 772 }, { "epoch": 1.2173228346456693, "grad_norm": 0.28080690838457484, "learning_rate": 3.302217036172695e-05, "loss": 0.3769, "step": 773 }, { "epoch": 1.2188976377952756, "grad_norm": 0.32349190554356694, "learning_rate": 3.299299883313886e-05, "loss": 0.3532, "step": 774 }, { "epoch": 1.220472440944882, "grad_norm": 0.2857796657088395, "learning_rate": 3.296382730455076e-05, "loss": 0.3603, "step": 775 }, { "epoch": 1.2220472440944883, "grad_norm": 0.30349442589166775, "learning_rate": 3.293465577596267e-05, "loss": 0.3617, "step": 776 }, { "epoch": 1.2236220472440944, "grad_norm": 0.27466864459847856, "learning_rate": 3.2905484247374564e-05, "loss": 0.3645, "step": 777 }, { "epoch": 1.2251968503937007, "grad_norm": 0.28452874482803336, "learning_rate": 3.287631271878647e-05, "loss": 0.3427, "step": 778 }, { "epoch": 1.226771653543307, "grad_norm": 0.2638089047698697, "learning_rate": 3.2847141190198366e-05, "loss": 0.3666, "step": 779 }, { "epoch": 1.2283464566929134, "grad_norm": 0.3078145194522091, "learning_rate": 3.281796966161027e-05, "loss": 0.3645, "step": 780 }, { "epoch": 1.2299212598425198, "grad_norm": 0.31629943755661644, "learning_rate": 3.278879813302217e-05, "loss": 0.3874, "step": 781 }, { "epoch": 1.231496062992126, "grad_norm": 0.28683150847721994, "learning_rate": 3.275962660443407e-05, "loss": 0.3467, "step": 782 }, { "epoch": 1.2330708661417322, "grad_norm": 0.348145704679024, "learning_rate": 3.273045507584598e-05, "loss": 0.3392, "step": 783 }, { "epoch": 1.2346456692913386, "grad_norm": 0.283665955949929, "learning_rate": 3.2701283547257875e-05, "loss": 0.3444, "step": 784 }, { "epoch": 1.236220472440945, "grad_norm": 0.27289245183113353, "learning_rate": 3.267211201866978e-05, "loss": 0.3521, "step": 785 }, { "epoch": 1.2377952755905512, "grad_norm": 0.29104227693168794, "learning_rate": 3.264294049008168e-05, "loss": 0.357, "step": 786 }, { "epoch": 1.2393700787401576, "grad_norm": 0.34879854338469973, "learning_rate": 3.261376896149358e-05, "loss": 0.3645, "step": 787 }, { "epoch": 1.2409448818897637, "grad_norm": 0.35669510337551863, "learning_rate": 3.2584597432905486e-05, "loss": 0.3603, "step": 788 }, { "epoch": 1.24251968503937, "grad_norm": 0.33915238596232006, "learning_rate": 3.255542590431739e-05, "loss": 0.3775, "step": 789 }, { "epoch": 1.2440944881889764, "grad_norm": 0.3995001805450645, "learning_rate": 3.252625437572929e-05, "loss": 0.3875, "step": 790 }, { "epoch": 1.2456692913385827, "grad_norm": 0.2844023943836169, "learning_rate": 3.249708284714119e-05, "loss": 0.3596, "step": 791 }, { "epoch": 1.247244094488189, "grad_norm": 0.3580299652285552, "learning_rate": 3.24679113185531e-05, "loss": 0.3582, "step": 792 }, { "epoch": 1.2488188976377952, "grad_norm": 0.30433143051810013, "learning_rate": 3.2438739789964995e-05, "loss": 0.3705, "step": 793 }, { "epoch": 1.2503937007874015, "grad_norm": 0.36430968290324145, "learning_rate": 3.24095682613769e-05, "loss": 0.3623, "step": 794 }, { "epoch": 1.2519685039370079, "grad_norm": 0.35435739952922096, "learning_rate": 3.23803967327888e-05, "loss": 0.3569, "step": 795 }, { "epoch": 1.2535433070866142, "grad_norm": 0.39376288630880407, "learning_rate": 3.23512252042007e-05, "loss": 0.3821, "step": 796 }, { "epoch": 1.2551181102362206, "grad_norm": 0.28911410804574866, "learning_rate": 3.23220536756126e-05, "loss": 0.3565, "step": 797 }, { "epoch": 1.2566929133858267, "grad_norm": 0.38679455248132705, "learning_rate": 3.2292882147024504e-05, "loss": 0.3775, "step": 798 }, { "epoch": 1.258267716535433, "grad_norm": 0.316796880593531, "learning_rate": 3.226371061843641e-05, "loss": 0.3768, "step": 799 }, { "epoch": 1.2598425196850394, "grad_norm": 0.3133324346651273, "learning_rate": 3.2234539089848306e-05, "loss": 0.3525, "step": 800 }, { "epoch": 1.2614173228346457, "grad_norm": 0.3299104635223967, "learning_rate": 3.220536756126021e-05, "loss": 0.3702, "step": 801 }, { "epoch": 1.262992125984252, "grad_norm": 0.3387208368441943, "learning_rate": 3.2176196032672115e-05, "loss": 0.3836, "step": 802 }, { "epoch": 1.2645669291338582, "grad_norm": 0.3553756778175882, "learning_rate": 3.214702450408402e-05, "loss": 0.364, "step": 803 }, { "epoch": 1.2661417322834645, "grad_norm": 0.36810426546385083, "learning_rate": 3.211785297549592e-05, "loss": 0.3622, "step": 804 }, { "epoch": 1.2677165354330708, "grad_norm": 0.31325299438759013, "learning_rate": 3.208868144690782e-05, "loss": 0.3691, "step": 805 }, { "epoch": 1.2692913385826772, "grad_norm": 0.36708747349249377, "learning_rate": 3.205950991831972e-05, "loss": 0.3553, "step": 806 }, { "epoch": 1.2708661417322835, "grad_norm": 0.3244684636603968, "learning_rate": 3.2030338389731624e-05, "loss": 0.3582, "step": 807 }, { "epoch": 1.2724409448818896, "grad_norm": 0.26492631673800704, "learning_rate": 3.200116686114353e-05, "loss": 0.3815, "step": 808 }, { "epoch": 1.2740157480314962, "grad_norm": 0.29804323666192795, "learning_rate": 3.1971995332555426e-05, "loss": 0.3539, "step": 809 }, { "epoch": 1.2755905511811023, "grad_norm": 0.3332770605648795, "learning_rate": 3.194282380396733e-05, "loss": 0.3514, "step": 810 }, { "epoch": 1.2771653543307087, "grad_norm": 0.2837936197621532, "learning_rate": 3.191365227537923e-05, "loss": 0.3859, "step": 811 }, { "epoch": 1.278740157480315, "grad_norm": 0.2840418528673192, "learning_rate": 3.188448074679113e-05, "loss": 0.357, "step": 812 }, { "epoch": 1.2803149606299213, "grad_norm": 0.32101038110622776, "learning_rate": 3.185530921820303e-05, "loss": 0.3453, "step": 813 }, { "epoch": 1.2818897637795277, "grad_norm": 0.29405808520124055, "learning_rate": 3.1826137689614935e-05, "loss": 0.3806, "step": 814 }, { "epoch": 1.2834645669291338, "grad_norm": 0.29610508390212054, "learning_rate": 3.179696616102684e-05, "loss": 0.3646, "step": 815 }, { "epoch": 1.2850393700787401, "grad_norm": 0.3052861742495208, "learning_rate": 3.1767794632438744e-05, "loss": 0.3482, "step": 816 }, { "epoch": 1.2866141732283465, "grad_norm": 0.2874864526371896, "learning_rate": 3.173862310385065e-05, "loss": 0.3603, "step": 817 }, { "epoch": 1.2881889763779528, "grad_norm": 0.3519780262053362, "learning_rate": 3.1709451575262546e-05, "loss": 0.3685, "step": 818 }, { "epoch": 1.2897637795275592, "grad_norm": 0.275814345741856, "learning_rate": 3.168028004667445e-05, "loss": 0.3674, "step": 819 }, { "epoch": 1.2913385826771653, "grad_norm": 0.3820657270917037, "learning_rate": 3.165110851808635e-05, "loss": 0.3638, "step": 820 }, { "epoch": 1.2929133858267716, "grad_norm": 0.27750577248959635, "learning_rate": 3.162193698949825e-05, "loss": 0.3581, "step": 821 }, { "epoch": 1.294488188976378, "grad_norm": 0.32493835660484427, "learning_rate": 3.159276546091015e-05, "loss": 0.3791, "step": 822 }, { "epoch": 1.2960629921259843, "grad_norm": 0.3102242200712811, "learning_rate": 3.1563593932322055e-05, "loss": 0.3684, "step": 823 }, { "epoch": 1.2976377952755906, "grad_norm": 0.2817084345195584, "learning_rate": 3.153442240373396e-05, "loss": 0.3699, "step": 824 }, { "epoch": 1.2992125984251968, "grad_norm": 0.2930105096910059, "learning_rate": 3.150525087514586e-05, "loss": 0.3422, "step": 825 }, { "epoch": 1.300787401574803, "grad_norm": 0.3159832160931849, "learning_rate": 3.147607934655776e-05, "loss": 0.3552, "step": 826 }, { "epoch": 1.3023622047244094, "grad_norm": 0.28163660088533254, "learning_rate": 3.144690781796966e-05, "loss": 0.3739, "step": 827 }, { "epoch": 1.3039370078740158, "grad_norm": 0.3416863642581504, "learning_rate": 3.1417736289381564e-05, "loss": 0.364, "step": 828 }, { "epoch": 1.3055118110236221, "grad_norm": 0.2708098964497504, "learning_rate": 3.138856476079347e-05, "loss": 0.3502, "step": 829 }, { "epoch": 1.3070866141732282, "grad_norm": 0.3008562473485546, "learning_rate": 3.135939323220537e-05, "loss": 0.3447, "step": 830 }, { "epoch": 1.3086614173228346, "grad_norm": 0.38104125503881026, "learning_rate": 3.133022170361727e-05, "loss": 0.3704, "step": 831 }, { "epoch": 1.310236220472441, "grad_norm": 0.30927706335897914, "learning_rate": 3.1301050175029175e-05, "loss": 0.3669, "step": 832 }, { "epoch": 1.3118110236220473, "grad_norm": 0.3131173440754822, "learning_rate": 3.127187864644108e-05, "loss": 0.3754, "step": 833 }, { "epoch": 1.3133858267716536, "grad_norm": 0.29022020807547627, "learning_rate": 3.124270711785298e-05, "loss": 0.3543, "step": 834 }, { "epoch": 1.3149606299212597, "grad_norm": 0.30661421224611823, "learning_rate": 3.121353558926488e-05, "loss": 0.3609, "step": 835 }, { "epoch": 1.316535433070866, "grad_norm": 0.29788051467687154, "learning_rate": 3.118436406067678e-05, "loss": 0.3608, "step": 836 }, { "epoch": 1.3181102362204724, "grad_norm": 0.3195699148882896, "learning_rate": 3.1155192532088683e-05, "loss": 0.3412, "step": 837 }, { "epoch": 1.3196850393700787, "grad_norm": 0.3143323584809881, "learning_rate": 3.112602100350058e-05, "loss": 0.3685, "step": 838 }, { "epoch": 1.321259842519685, "grad_norm": 0.35225410611536917, "learning_rate": 3.1096849474912486e-05, "loss": 0.3546, "step": 839 }, { "epoch": 1.3228346456692912, "grad_norm": 0.2737280456431728, "learning_rate": 3.106767794632439e-05, "loss": 0.3478, "step": 840 }, { "epoch": 1.3244094488188978, "grad_norm": 0.31191046636474445, "learning_rate": 3.103850641773629e-05, "loss": 0.3579, "step": 841 }, { "epoch": 1.325984251968504, "grad_norm": 0.3331971445112432, "learning_rate": 3.100933488914819e-05, "loss": 0.3533, "step": 842 }, { "epoch": 1.3275590551181102, "grad_norm": 0.2841600936343056, "learning_rate": 3.09801633605601e-05, "loss": 0.3815, "step": 843 }, { "epoch": 1.3291338582677166, "grad_norm": 0.2936730643041818, "learning_rate": 3.0950991831972e-05, "loss": 0.3608, "step": 844 }, { "epoch": 1.330708661417323, "grad_norm": 0.27210224745659384, "learning_rate": 3.09218203033839e-05, "loss": 0.373, "step": 845 }, { "epoch": 1.3322834645669293, "grad_norm": 0.3139024265832329, "learning_rate": 3.08926487747958e-05, "loss": 0.3904, "step": 846 }, { "epoch": 1.3338582677165354, "grad_norm": 0.29350539800496533, "learning_rate": 3.08634772462077e-05, "loss": 0.3649, "step": 847 }, { "epoch": 1.3354330708661417, "grad_norm": 0.3010593499529191, "learning_rate": 3.0834305717619605e-05, "loss": 0.3525, "step": 848 }, { "epoch": 1.337007874015748, "grad_norm": 0.3834453137400492, "learning_rate": 3.080513418903151e-05, "loss": 0.3739, "step": 849 }, { "epoch": 1.3385826771653544, "grad_norm": 0.3013096667388302, "learning_rate": 3.077596266044341e-05, "loss": 0.3407, "step": 850 }, { "epoch": 1.3401574803149607, "grad_norm": 0.3212348643063403, "learning_rate": 3.074679113185531e-05, "loss": 0.3553, "step": 851 }, { "epoch": 1.3417322834645669, "grad_norm": 0.2895567776022644, "learning_rate": 3.071761960326721e-05, "loss": 0.3625, "step": 852 }, { "epoch": 1.3433070866141732, "grad_norm": 0.3209600796397439, "learning_rate": 3.0688448074679114e-05, "loss": 0.3748, "step": 853 }, { "epoch": 1.3448818897637795, "grad_norm": 0.2927656622363561, "learning_rate": 3.065927654609101e-05, "loss": 0.3525, "step": 854 }, { "epoch": 1.3464566929133859, "grad_norm": 0.29779989515977373, "learning_rate": 3.0630105017502916e-05, "loss": 0.3371, "step": 855 }, { "epoch": 1.3480314960629922, "grad_norm": 0.3054977215284221, "learning_rate": 3.060093348891482e-05, "loss": 0.3627, "step": 856 }, { "epoch": 1.3496062992125983, "grad_norm": 0.3617903119759193, "learning_rate": 3.0571761960326725e-05, "loss": 0.3625, "step": 857 }, { "epoch": 1.3511811023622047, "grad_norm": 0.3528674755734383, "learning_rate": 3.054259043173863e-05, "loss": 0.3549, "step": 858 }, { "epoch": 1.352755905511811, "grad_norm": 0.41043469880322536, "learning_rate": 3.0513418903150524e-05, "loss": 0.3693, "step": 859 }, { "epoch": 1.3543307086614174, "grad_norm": 0.323562245641776, "learning_rate": 3.048424737456243e-05, "loss": 0.3726, "step": 860 }, { "epoch": 1.3559055118110237, "grad_norm": 0.41699778283204253, "learning_rate": 3.045507584597433e-05, "loss": 0.3813, "step": 861 }, { "epoch": 1.3574803149606298, "grad_norm": 0.38124500197498945, "learning_rate": 3.0425904317386234e-05, "loss": 0.3759, "step": 862 }, { "epoch": 1.3590551181102362, "grad_norm": 0.3980723523914118, "learning_rate": 3.0396732788798132e-05, "loss": 0.3889, "step": 863 }, { "epoch": 1.3606299212598425, "grad_norm": 0.33970759521181565, "learning_rate": 3.0367561260210036e-05, "loss": 0.3602, "step": 864 }, { "epoch": 1.3622047244094488, "grad_norm": 0.33195278273974516, "learning_rate": 3.033838973162194e-05, "loss": 0.3501, "step": 865 }, { "epoch": 1.3637795275590552, "grad_norm": 0.3697737244267441, "learning_rate": 3.030921820303384e-05, "loss": 0.3665, "step": 866 }, { "epoch": 1.3653543307086613, "grad_norm": 0.3125970982769314, "learning_rate": 3.0280046674445743e-05, "loss": 0.3714, "step": 867 }, { "epoch": 1.3669291338582676, "grad_norm": 0.35312710313773876, "learning_rate": 3.0250875145857644e-05, "loss": 0.3508, "step": 868 }, { "epoch": 1.368503937007874, "grad_norm": 0.3106637800633254, "learning_rate": 3.022170361726955e-05, "loss": 0.3549, "step": 869 }, { "epoch": 1.3700787401574803, "grad_norm": 0.29338090759103147, "learning_rate": 3.0192532088681446e-05, "loss": 0.3636, "step": 870 }, { "epoch": 1.3716535433070867, "grad_norm": 0.3217283037586182, "learning_rate": 3.016336056009335e-05, "loss": 0.3663, "step": 871 }, { "epoch": 1.3732283464566928, "grad_norm": 0.33774817734354173, "learning_rate": 3.013418903150525e-05, "loss": 0.3621, "step": 872 }, { "epoch": 1.3748031496062993, "grad_norm": 0.30793274184463043, "learning_rate": 3.0105017502917153e-05, "loss": 0.3608, "step": 873 }, { "epoch": 1.3763779527559055, "grad_norm": 0.3479217507306264, "learning_rate": 3.0075845974329057e-05, "loss": 0.3762, "step": 874 }, { "epoch": 1.3779527559055118, "grad_norm": 0.35086650648144574, "learning_rate": 3.004667444574096e-05, "loss": 0.3748, "step": 875 }, { "epoch": 1.3795275590551181, "grad_norm": 0.34956555343139023, "learning_rate": 3.0017502917152863e-05, "loss": 0.3826, "step": 876 }, { "epoch": 1.3811023622047245, "grad_norm": 0.3103271295742857, "learning_rate": 2.998833138856476e-05, "loss": 0.3551, "step": 877 }, { "epoch": 1.3826771653543308, "grad_norm": 0.3522929527213216, "learning_rate": 2.9959159859976665e-05, "loss": 0.3523, "step": 878 }, { "epoch": 1.384251968503937, "grad_norm": 0.30947542374866466, "learning_rate": 2.9929988331388563e-05, "loss": 0.363, "step": 879 }, { "epoch": 1.3858267716535433, "grad_norm": 0.36518460241165607, "learning_rate": 2.9900816802800467e-05, "loss": 0.3662, "step": 880 }, { "epoch": 1.3874015748031496, "grad_norm": 0.3074644669485209, "learning_rate": 2.9871645274212368e-05, "loss": 0.3657, "step": 881 }, { "epoch": 1.388976377952756, "grad_norm": 0.29640320728579966, "learning_rate": 2.9842473745624273e-05, "loss": 0.3563, "step": 882 }, { "epoch": 1.3905511811023623, "grad_norm": 0.30923761937976596, "learning_rate": 2.9813302217036177e-05, "loss": 0.3508, "step": 883 }, { "epoch": 1.3921259842519684, "grad_norm": 0.3698200351039963, "learning_rate": 2.9784130688448075e-05, "loss": 0.3723, "step": 884 }, { "epoch": 1.3937007874015748, "grad_norm": 0.3490878879591213, "learning_rate": 2.975495915985998e-05, "loss": 0.3639, "step": 885 }, { "epoch": 1.395275590551181, "grad_norm": 0.2972960495742603, "learning_rate": 2.9725787631271877e-05, "loss": 0.3645, "step": 886 }, { "epoch": 1.3968503937007875, "grad_norm": 0.3712565708335128, "learning_rate": 2.969661610268378e-05, "loss": 0.3556, "step": 887 }, { "epoch": 1.3984251968503938, "grad_norm": 0.4323926564028772, "learning_rate": 2.9667444574095683e-05, "loss": 0.376, "step": 888 }, { "epoch": 1.4, "grad_norm": 0.3585706955629851, "learning_rate": 2.9638273045507587e-05, "loss": 0.3507, "step": 889 }, { "epoch": 1.4015748031496063, "grad_norm": 0.34172117029734483, "learning_rate": 2.960910151691949e-05, "loss": 0.3578, "step": 890 }, { "epoch": 1.4031496062992126, "grad_norm": 0.34404589611188724, "learning_rate": 2.957992998833139e-05, "loss": 0.3815, "step": 891 }, { "epoch": 1.404724409448819, "grad_norm": 0.3768304517285473, "learning_rate": 2.9550758459743294e-05, "loss": 0.349, "step": 892 }, { "epoch": 1.4062992125984253, "grad_norm": 0.343228167858349, "learning_rate": 2.952158693115519e-05, "loss": 0.3628, "step": 893 }, { "epoch": 1.4078740157480314, "grad_norm": 0.3614351434010255, "learning_rate": 2.9492415402567096e-05, "loss": 0.3541, "step": 894 }, { "epoch": 1.4094488188976377, "grad_norm": 0.3154353440352359, "learning_rate": 2.9463243873978997e-05, "loss": 0.3563, "step": 895 }, { "epoch": 1.411023622047244, "grad_norm": 0.3658824704262604, "learning_rate": 2.94340723453909e-05, "loss": 0.3691, "step": 896 }, { "epoch": 1.4125984251968504, "grad_norm": 0.3074950373716607, "learning_rate": 2.94049008168028e-05, "loss": 0.3503, "step": 897 }, { "epoch": 1.4141732283464568, "grad_norm": 0.2846349617678375, "learning_rate": 2.9375729288214704e-05, "loss": 0.366, "step": 898 }, { "epoch": 1.4157480314960629, "grad_norm": 0.32745861228124173, "learning_rate": 2.9346557759626608e-05, "loss": 0.3475, "step": 899 }, { "epoch": 1.4173228346456692, "grad_norm": 0.3325875617530454, "learning_rate": 2.9317386231038506e-05, "loss": 0.3445, "step": 900 }, { "epoch": 1.4188976377952756, "grad_norm": 0.33020643842151753, "learning_rate": 2.928821470245041e-05, "loss": 0.3748, "step": 901 }, { "epoch": 1.420472440944882, "grad_norm": 0.3105835300246082, "learning_rate": 2.925904317386231e-05, "loss": 0.353, "step": 902 }, { "epoch": 1.4220472440944882, "grad_norm": 0.34559456063854044, "learning_rate": 2.9229871645274216e-05, "loss": 0.3442, "step": 903 }, { "epoch": 1.4236220472440944, "grad_norm": 0.2904738566439748, "learning_rate": 2.9200700116686113e-05, "loss": 0.3633, "step": 904 }, { "epoch": 1.425196850393701, "grad_norm": 0.3259696962298904, "learning_rate": 2.9171528588098018e-05, "loss": 0.3699, "step": 905 }, { "epoch": 1.426771653543307, "grad_norm": 0.355365643023126, "learning_rate": 2.9142357059509916e-05, "loss": 0.3611, "step": 906 }, { "epoch": 1.4283464566929134, "grad_norm": 0.3075151968539967, "learning_rate": 2.911318553092182e-05, "loss": 0.3638, "step": 907 }, { "epoch": 1.4299212598425197, "grad_norm": 0.3687319899834052, "learning_rate": 2.9084014002333725e-05, "loss": 0.3559, "step": 908 }, { "epoch": 1.431496062992126, "grad_norm": 0.3015577589606396, "learning_rate": 2.9054842473745626e-05, "loss": 0.3673, "step": 909 }, { "epoch": 1.4330708661417324, "grad_norm": 0.3128091393388433, "learning_rate": 2.902567094515753e-05, "loss": 0.3679, "step": 910 }, { "epoch": 1.4346456692913385, "grad_norm": 0.3223662690772937, "learning_rate": 2.8996499416569428e-05, "loss": 0.3592, "step": 911 }, { "epoch": 1.4362204724409449, "grad_norm": 0.3054083943087674, "learning_rate": 2.8967327887981332e-05, "loss": 0.3706, "step": 912 }, { "epoch": 1.4377952755905512, "grad_norm": 0.3929179842912538, "learning_rate": 2.893815635939323e-05, "loss": 0.3547, "step": 913 }, { "epoch": 1.4393700787401575, "grad_norm": 0.32193765015220166, "learning_rate": 2.8908984830805134e-05, "loss": 0.389, "step": 914 }, { "epoch": 1.4409448818897639, "grad_norm": 0.37546097285852664, "learning_rate": 2.887981330221704e-05, "loss": 0.3812, "step": 915 }, { "epoch": 1.44251968503937, "grad_norm": 0.34561470552273615, "learning_rate": 2.885064177362894e-05, "loss": 0.366, "step": 916 }, { "epoch": 1.4440944881889763, "grad_norm": 0.32528887852569566, "learning_rate": 2.8821470245040844e-05, "loss": 0.3722, "step": 917 }, { "epoch": 1.4456692913385827, "grad_norm": 0.2936156528452562, "learning_rate": 2.8792298716452742e-05, "loss": 0.3691, "step": 918 }, { "epoch": 1.447244094488189, "grad_norm": 0.3101855595732238, "learning_rate": 2.8763127187864647e-05, "loss": 0.3625, "step": 919 }, { "epoch": 1.4488188976377954, "grad_norm": 0.2863611300372958, "learning_rate": 2.8733955659276544e-05, "loss": 0.3379, "step": 920 }, { "epoch": 1.4503937007874015, "grad_norm": 0.33763728040647367, "learning_rate": 2.870478413068845e-05, "loss": 0.3624, "step": 921 }, { "epoch": 1.4519685039370078, "grad_norm": 0.31459224472470565, "learning_rate": 2.867561260210035e-05, "loss": 0.357, "step": 922 }, { "epoch": 1.4535433070866142, "grad_norm": 0.28521312886073225, "learning_rate": 2.8646441073512254e-05, "loss": 0.3697, "step": 923 }, { "epoch": 1.4551181102362205, "grad_norm": 0.38381292783493626, "learning_rate": 2.861726954492416e-05, "loss": 0.3507, "step": 924 }, { "epoch": 1.4566929133858268, "grad_norm": 0.34342312683038684, "learning_rate": 2.8588098016336056e-05, "loss": 0.3695, "step": 925 }, { "epoch": 1.458267716535433, "grad_norm": 0.320885150270499, "learning_rate": 2.855892648774796e-05, "loss": 0.3693, "step": 926 }, { "epoch": 1.4598425196850393, "grad_norm": 0.3387208042058045, "learning_rate": 2.852975495915986e-05, "loss": 0.3427, "step": 927 }, { "epoch": 1.4614173228346456, "grad_norm": 0.3048015910812705, "learning_rate": 2.8500583430571763e-05, "loss": 0.3849, "step": 928 }, { "epoch": 1.462992125984252, "grad_norm": 0.31534786904231055, "learning_rate": 2.8471411901983664e-05, "loss": 0.3642, "step": 929 }, { "epoch": 1.4645669291338583, "grad_norm": 0.32936312326226097, "learning_rate": 2.844224037339557e-05, "loss": 0.3665, "step": 930 }, { "epoch": 1.4661417322834644, "grad_norm": 0.33048014525316394, "learning_rate": 2.8413068844807466e-05, "loss": 0.3658, "step": 931 }, { "epoch": 1.4677165354330708, "grad_norm": 0.2820629968881048, "learning_rate": 2.838389731621937e-05, "loss": 0.3672, "step": 932 }, { "epoch": 1.4692913385826771, "grad_norm": 0.2983651933358318, "learning_rate": 2.8354725787631275e-05, "loss": 0.3705, "step": 933 }, { "epoch": 1.4708661417322835, "grad_norm": 0.29021884517840973, "learning_rate": 2.8325554259043173e-05, "loss": 0.3559, "step": 934 }, { "epoch": 1.4724409448818898, "grad_norm": 0.27477694486896764, "learning_rate": 2.8296382730455077e-05, "loss": 0.3569, "step": 935 }, { "epoch": 1.474015748031496, "grad_norm": 0.2563217028644257, "learning_rate": 2.826721120186698e-05, "loss": 0.3373, "step": 936 }, { "epoch": 1.4755905511811025, "grad_norm": 0.28450887988826534, "learning_rate": 2.8238039673278883e-05, "loss": 0.3502, "step": 937 }, { "epoch": 1.4771653543307086, "grad_norm": 0.27282013534140975, "learning_rate": 2.820886814469078e-05, "loss": 0.3576, "step": 938 }, { "epoch": 1.478740157480315, "grad_norm": 0.3077175658275005, "learning_rate": 2.8179696616102685e-05, "loss": 0.3525, "step": 939 }, { "epoch": 1.4803149606299213, "grad_norm": 0.30042186618743144, "learning_rate": 2.815052508751459e-05, "loss": 0.3764, "step": 940 }, { "epoch": 1.4818897637795276, "grad_norm": 0.314453106725561, "learning_rate": 2.8121353558926487e-05, "loss": 0.3427, "step": 941 }, { "epoch": 1.483464566929134, "grad_norm": 0.292804102409363, "learning_rate": 2.8092182030338392e-05, "loss": 0.3547, "step": 942 }, { "epoch": 1.48503937007874, "grad_norm": 0.27393462709199856, "learning_rate": 2.8063010501750293e-05, "loss": 0.352, "step": 943 }, { "epoch": 1.4866141732283464, "grad_norm": 0.26954639560539134, "learning_rate": 2.8033838973162197e-05, "loss": 0.3463, "step": 944 }, { "epoch": 1.4881889763779528, "grad_norm": 0.27446531908839333, "learning_rate": 2.8004667444574095e-05, "loss": 0.3551, "step": 945 }, { "epoch": 1.4897637795275591, "grad_norm": 0.2870423382693005, "learning_rate": 2.7975495915986e-05, "loss": 0.3523, "step": 946 }, { "epoch": 1.4913385826771655, "grad_norm": 0.2530000413257329, "learning_rate": 2.7946324387397897e-05, "loss": 0.3507, "step": 947 }, { "epoch": 1.4929133858267716, "grad_norm": 0.33139545629638945, "learning_rate": 2.79171528588098e-05, "loss": 0.3438, "step": 948 }, { "epoch": 1.494488188976378, "grad_norm": 0.3372755728804489, "learning_rate": 2.7887981330221706e-05, "loss": 0.3694, "step": 949 }, { "epoch": 1.4960629921259843, "grad_norm": 0.3101862121089173, "learning_rate": 2.7858809801633607e-05, "loss": 0.396, "step": 950 }, { "epoch": 1.4976377952755906, "grad_norm": 0.2753478877556497, "learning_rate": 2.782963827304551e-05, "loss": 0.3743, "step": 951 }, { "epoch": 1.499212598425197, "grad_norm": 0.3473247283492234, "learning_rate": 2.780046674445741e-05, "loss": 0.3608, "step": 952 }, { "epoch": 1.500787401574803, "grad_norm": 0.310169179051179, "learning_rate": 2.7771295215869314e-05, "loss": 0.369, "step": 953 }, { "epoch": 1.5023622047244094, "grad_norm": 0.2856648442173418, "learning_rate": 2.774212368728121e-05, "loss": 0.363, "step": 954 }, { "epoch": 1.5039370078740157, "grad_norm": 0.29545969022340896, "learning_rate": 2.7712952158693116e-05, "loss": 0.3666, "step": 955 }, { "epoch": 1.505511811023622, "grad_norm": 0.31579275320722405, "learning_rate": 2.7683780630105017e-05, "loss": 0.3601, "step": 956 }, { "epoch": 1.5070866141732284, "grad_norm": 0.32276503399852075, "learning_rate": 2.765460910151692e-05, "loss": 0.3578, "step": 957 }, { "epoch": 1.5086614173228345, "grad_norm": 0.2734957496200111, "learning_rate": 2.7625437572928826e-05, "loss": 0.3759, "step": 958 }, { "epoch": 1.510236220472441, "grad_norm": 0.28406754548077584, "learning_rate": 2.7596266044340724e-05, "loss": 0.3461, "step": 959 }, { "epoch": 1.5118110236220472, "grad_norm": 0.3302637409736247, "learning_rate": 2.7567094515752628e-05, "loss": 0.3385, "step": 960 }, { "epoch": 1.5133858267716536, "grad_norm": 0.27453854953829043, "learning_rate": 2.7537922987164526e-05, "loss": 0.35, "step": 961 }, { "epoch": 1.51496062992126, "grad_norm": 0.3144748398502992, "learning_rate": 2.750875145857643e-05, "loss": 0.3733, "step": 962 }, { "epoch": 1.516535433070866, "grad_norm": 0.2997655407623314, "learning_rate": 2.747957992998833e-05, "loss": 0.3673, "step": 963 }, { "epoch": 1.5181102362204726, "grad_norm": 0.2876413027067786, "learning_rate": 2.7450408401400236e-05, "loss": 0.3703, "step": 964 }, { "epoch": 1.5196850393700787, "grad_norm": 0.32701212545822095, "learning_rate": 2.742123687281214e-05, "loss": 0.3749, "step": 965 }, { "epoch": 1.521259842519685, "grad_norm": 0.2787641358682229, "learning_rate": 2.7392065344224038e-05, "loss": 0.357, "step": 966 }, { "epoch": 1.5228346456692914, "grad_norm": 0.31768284323055423, "learning_rate": 2.7362893815635942e-05, "loss": 0.3616, "step": 967 }, { "epoch": 1.5244094488188975, "grad_norm": 0.3048894388009916, "learning_rate": 2.733372228704784e-05, "loss": 0.3555, "step": 968 }, { "epoch": 1.525984251968504, "grad_norm": 0.30552988386129404, "learning_rate": 2.7304550758459745e-05, "loss": 0.3577, "step": 969 }, { "epoch": 1.5275590551181102, "grad_norm": 0.3068550390643556, "learning_rate": 2.7275379229871646e-05, "loss": 0.3608, "step": 970 }, { "epoch": 1.5291338582677165, "grad_norm": 0.3026553998660087, "learning_rate": 2.724620770128355e-05, "loss": 0.3645, "step": 971 }, { "epoch": 1.5307086614173229, "grad_norm": 0.3273299112511259, "learning_rate": 2.7217036172695448e-05, "loss": 0.3674, "step": 972 }, { "epoch": 1.532283464566929, "grad_norm": 0.3285747391356577, "learning_rate": 2.7187864644107352e-05, "loss": 0.3647, "step": 973 }, { "epoch": 1.5338582677165356, "grad_norm": 0.3546559916148701, "learning_rate": 2.7158693115519257e-05, "loss": 0.357, "step": 974 }, { "epoch": 1.5354330708661417, "grad_norm": 0.3330995380082502, "learning_rate": 2.7129521586931155e-05, "loss": 0.3723, "step": 975 }, { "epoch": 1.537007874015748, "grad_norm": 0.3874919915969354, "learning_rate": 2.710035005834306e-05, "loss": 0.3528, "step": 976 }, { "epoch": 1.5385826771653544, "grad_norm": 0.3490199861647204, "learning_rate": 2.707117852975496e-05, "loss": 0.3708, "step": 977 }, { "epoch": 1.5401574803149605, "grad_norm": 0.3314349291995133, "learning_rate": 2.7042007001166865e-05, "loss": 0.3674, "step": 978 }, { "epoch": 1.541732283464567, "grad_norm": 0.3623066327222135, "learning_rate": 2.7012835472578762e-05, "loss": 0.3641, "step": 979 }, { "epoch": 1.5433070866141732, "grad_norm": 0.3252672697218016, "learning_rate": 2.6983663943990667e-05, "loss": 0.3631, "step": 980 }, { "epoch": 1.5448818897637795, "grad_norm": 0.34401849786210903, "learning_rate": 2.6954492415402564e-05, "loss": 0.3596, "step": 981 }, { "epoch": 1.5464566929133858, "grad_norm": 0.41674778834362847, "learning_rate": 2.692532088681447e-05, "loss": 0.3644, "step": 982 }, { "epoch": 1.5480314960629922, "grad_norm": 0.30193600449116775, "learning_rate": 2.6896149358226373e-05, "loss": 0.3422, "step": 983 }, { "epoch": 1.5496062992125985, "grad_norm": 0.2689547777525334, "learning_rate": 2.6866977829638274e-05, "loss": 0.3705, "step": 984 }, { "epoch": 1.5511811023622046, "grad_norm": 0.3587676191669122, "learning_rate": 2.683780630105018e-05, "loss": 0.3741, "step": 985 }, { "epoch": 1.552755905511811, "grad_norm": 0.28235160830155925, "learning_rate": 2.6808634772462077e-05, "loss": 0.3719, "step": 986 }, { "epoch": 1.5543307086614173, "grad_norm": 0.28617988142223255, "learning_rate": 2.677946324387398e-05, "loss": 0.3542, "step": 987 }, { "epoch": 1.5559055118110237, "grad_norm": 0.33221516920957156, "learning_rate": 2.675029171528588e-05, "loss": 0.3812, "step": 988 }, { "epoch": 1.55748031496063, "grad_norm": 0.24840089251701752, "learning_rate": 2.6721120186697783e-05, "loss": 0.3591, "step": 989 }, { "epoch": 1.5590551181102361, "grad_norm": 0.3034742394934861, "learning_rate": 2.6691948658109688e-05, "loss": 0.3587, "step": 990 }, { "epoch": 1.5606299212598427, "grad_norm": 0.3216893643176448, "learning_rate": 2.666277712952159e-05, "loss": 0.3423, "step": 991 }, { "epoch": 1.5622047244094488, "grad_norm": 0.2492696142213711, "learning_rate": 2.6633605600933493e-05, "loss": 0.3463, "step": 992 }, { "epoch": 1.5637795275590551, "grad_norm": 0.27347483234063674, "learning_rate": 2.660443407234539e-05, "loss": 0.3602, "step": 993 }, { "epoch": 1.5653543307086615, "grad_norm": 0.287923944928929, "learning_rate": 2.6575262543757295e-05, "loss": 0.3629, "step": 994 }, { "epoch": 1.5669291338582676, "grad_norm": 0.28458914162373966, "learning_rate": 2.6546091015169193e-05, "loss": 0.3777, "step": 995 }, { "epoch": 1.5685039370078742, "grad_norm": 0.3016300751260411, "learning_rate": 2.6516919486581098e-05, "loss": 0.3434, "step": 996 }, { "epoch": 1.5700787401574803, "grad_norm": 0.26457536591880293, "learning_rate": 2.6487747957993e-05, "loss": 0.3592, "step": 997 }, { "epoch": 1.5716535433070866, "grad_norm": 0.26898540097429263, "learning_rate": 2.6458576429404903e-05, "loss": 0.3481, "step": 998 }, { "epoch": 1.573228346456693, "grad_norm": 0.27309572753349975, "learning_rate": 2.6429404900816808e-05, "loss": 0.3709, "step": 999 }, { "epoch": 1.574803149606299, "grad_norm": 0.3395046751202685, "learning_rate": 2.6400233372228705e-05, "loss": 0.3729, "step": 1000 }, { "epoch": 1.5763779527559056, "grad_norm": 0.340877937037139, "learning_rate": 2.637106184364061e-05, "loss": 0.3457, "step": 1001 }, { "epoch": 1.5779527559055118, "grad_norm": 0.299962139228569, "learning_rate": 2.6341890315052507e-05, "loss": 0.3772, "step": 1002 }, { "epoch": 1.579527559055118, "grad_norm": 0.2961369378882837, "learning_rate": 2.6312718786464412e-05, "loss": 0.3652, "step": 1003 }, { "epoch": 1.5811023622047244, "grad_norm": 0.2905190649537378, "learning_rate": 2.6283547257876313e-05, "loss": 0.3705, "step": 1004 }, { "epoch": 1.5826771653543306, "grad_norm": 0.35268516734000777, "learning_rate": 2.6254375729288217e-05, "loss": 0.358, "step": 1005 }, { "epoch": 1.5842519685039371, "grad_norm": 0.25090260807739384, "learning_rate": 2.6225204200700115e-05, "loss": 0.3663, "step": 1006 }, { "epoch": 1.5858267716535432, "grad_norm": 0.3005324798232957, "learning_rate": 2.619603267211202e-05, "loss": 0.3532, "step": 1007 }, { "epoch": 1.5874015748031496, "grad_norm": 0.3217909147853101, "learning_rate": 2.6166861143523924e-05, "loss": 0.3631, "step": 1008 }, { "epoch": 1.588976377952756, "grad_norm": 0.2756094162393922, "learning_rate": 2.6137689614935822e-05, "loss": 0.3563, "step": 1009 }, { "epoch": 1.590551181102362, "grad_norm": 0.36314644680027136, "learning_rate": 2.6108518086347726e-05, "loss": 0.3606, "step": 1010 }, { "epoch": 1.5921259842519686, "grad_norm": 0.27222463544502146, "learning_rate": 2.6079346557759627e-05, "loss": 0.3622, "step": 1011 }, { "epoch": 1.5937007874015747, "grad_norm": 0.2771083110438106, "learning_rate": 2.6050175029171532e-05, "loss": 0.3594, "step": 1012 }, { "epoch": 1.595275590551181, "grad_norm": 0.30891236776895886, "learning_rate": 2.602100350058343e-05, "loss": 0.3698, "step": 1013 }, { "epoch": 1.5968503937007874, "grad_norm": 0.3298406573535251, "learning_rate": 2.5991831971995334e-05, "loss": 0.3645, "step": 1014 }, { "epoch": 1.5984251968503937, "grad_norm": 0.2812648739359526, "learning_rate": 2.596266044340724e-05, "loss": 0.3726, "step": 1015 }, { "epoch": 1.6, "grad_norm": 0.3554024668321782, "learning_rate": 2.5933488914819136e-05, "loss": 0.3756, "step": 1016 }, { "epoch": 1.6015748031496062, "grad_norm": 0.3222797211865368, "learning_rate": 2.590431738623104e-05, "loss": 0.3594, "step": 1017 }, { "epoch": 1.6031496062992125, "grad_norm": 0.3534672201855548, "learning_rate": 2.587514585764294e-05, "loss": 0.3747, "step": 1018 }, { "epoch": 1.604724409448819, "grad_norm": 0.3184393266809904, "learning_rate": 2.5845974329054846e-05, "loss": 0.3599, "step": 1019 }, { "epoch": 1.6062992125984252, "grad_norm": 0.3161015364108231, "learning_rate": 2.5816802800466744e-05, "loss": 0.3864, "step": 1020 }, { "epoch": 1.6078740157480316, "grad_norm": 0.3146167571811909, "learning_rate": 2.5787631271878648e-05, "loss": 0.3679, "step": 1021 }, { "epoch": 1.6094488188976377, "grad_norm": 0.34958409216897757, "learning_rate": 2.5758459743290546e-05, "loss": 0.3444, "step": 1022 }, { "epoch": 1.6110236220472443, "grad_norm": 0.27278224387873085, "learning_rate": 2.572928821470245e-05, "loss": 0.3604, "step": 1023 }, { "epoch": 1.6125984251968504, "grad_norm": 0.3143972213118866, "learning_rate": 2.5700116686114355e-05, "loss": 0.3664, "step": 1024 }, { "epoch": 1.6141732283464567, "grad_norm": 0.27509993985242825, "learning_rate": 2.5670945157526256e-05, "loss": 0.3602, "step": 1025 }, { "epoch": 1.615748031496063, "grad_norm": 0.2738856783640687, "learning_rate": 2.564177362893816e-05, "loss": 0.3462, "step": 1026 }, { "epoch": 1.6173228346456692, "grad_norm": 0.2933255391188308, "learning_rate": 2.5612602100350058e-05, "loss": 0.3584, "step": 1027 }, { "epoch": 1.6188976377952757, "grad_norm": 0.31744642441169457, "learning_rate": 2.5583430571761963e-05, "loss": 0.3511, "step": 1028 }, { "epoch": 1.6204724409448819, "grad_norm": 0.26933537784225214, "learning_rate": 2.555425904317386e-05, "loss": 0.3607, "step": 1029 }, { "epoch": 1.6220472440944882, "grad_norm": 0.3600351931105396, "learning_rate": 2.5525087514585765e-05, "loss": 0.3637, "step": 1030 }, { "epoch": 1.6236220472440945, "grad_norm": 0.2962561540992693, "learning_rate": 2.5495915985997666e-05, "loss": 0.3679, "step": 1031 }, { "epoch": 1.6251968503937007, "grad_norm": 0.2954695220281512, "learning_rate": 2.546674445740957e-05, "loss": 0.3761, "step": 1032 }, { "epoch": 1.6267716535433072, "grad_norm": 0.28263301024954113, "learning_rate": 2.5437572928821475e-05, "loss": 0.3664, "step": 1033 }, { "epoch": 1.6283464566929133, "grad_norm": 0.33796823659640823, "learning_rate": 2.5408401400233373e-05, "loss": 0.3537, "step": 1034 }, { "epoch": 1.6299212598425197, "grad_norm": 0.2953958974828149, "learning_rate": 2.5379229871645277e-05, "loss": 0.3561, "step": 1035 }, { "epoch": 1.631496062992126, "grad_norm": 0.31975696900644957, "learning_rate": 2.5350058343057175e-05, "loss": 0.3596, "step": 1036 }, { "epoch": 1.6330708661417321, "grad_norm": 0.2809937332415953, "learning_rate": 2.532088681446908e-05, "loss": 0.3667, "step": 1037 }, { "epoch": 1.6346456692913387, "grad_norm": 0.33007877745374997, "learning_rate": 2.529171528588098e-05, "loss": 0.3632, "step": 1038 }, { "epoch": 1.6362204724409448, "grad_norm": 0.3511858260212273, "learning_rate": 2.5262543757292885e-05, "loss": 0.3733, "step": 1039 }, { "epoch": 1.6377952755905512, "grad_norm": 0.2908814459908971, "learning_rate": 2.523337222870479e-05, "loss": 0.354, "step": 1040 }, { "epoch": 1.6393700787401575, "grad_norm": 0.30310859949838803, "learning_rate": 2.5204200700116687e-05, "loss": 0.3412, "step": 1041 }, { "epoch": 1.6409448818897636, "grad_norm": 0.2640525252211153, "learning_rate": 2.517502917152859e-05, "loss": 0.3392, "step": 1042 }, { "epoch": 1.6425196850393702, "grad_norm": 0.2965837093719704, "learning_rate": 2.514585764294049e-05, "loss": 0.3493, "step": 1043 }, { "epoch": 1.6440944881889763, "grad_norm": 0.28038316409071473, "learning_rate": 2.5116686114352393e-05, "loss": 0.3631, "step": 1044 }, { "epoch": 1.6456692913385826, "grad_norm": 0.26580714438427133, "learning_rate": 2.5087514585764295e-05, "loss": 0.3461, "step": 1045 }, { "epoch": 1.647244094488189, "grad_norm": 0.3114900674869988, "learning_rate": 2.50583430571762e-05, "loss": 0.37, "step": 1046 }, { "epoch": 1.6488188976377953, "grad_norm": 0.31620112931731603, "learning_rate": 2.5029171528588097e-05, "loss": 0.3782, "step": 1047 }, { "epoch": 1.6503937007874017, "grad_norm": 0.2908409996058806, "learning_rate": 2.5e-05, "loss": 0.3777, "step": 1048 }, { "epoch": 1.6519685039370078, "grad_norm": 0.27585819647895293, "learning_rate": 2.4970828471411902e-05, "loss": 0.3606, "step": 1049 }, { "epoch": 1.6535433070866141, "grad_norm": 0.289696020726859, "learning_rate": 2.4941656942823803e-05, "loss": 0.3364, "step": 1050 }, { "epoch": 1.6551181102362205, "grad_norm": 0.30948866218198406, "learning_rate": 2.4912485414235708e-05, "loss": 0.3655, "step": 1051 }, { "epoch": 1.6566929133858268, "grad_norm": 0.3020847983392082, "learning_rate": 2.488331388564761e-05, "loss": 0.3486, "step": 1052 }, { "epoch": 1.6582677165354331, "grad_norm": 0.27495560174516603, "learning_rate": 2.4854142357059513e-05, "loss": 0.3522, "step": 1053 }, { "epoch": 1.6598425196850393, "grad_norm": 0.3080281420023759, "learning_rate": 2.4824970828471414e-05, "loss": 0.3676, "step": 1054 }, { "epoch": 1.6614173228346458, "grad_norm": 0.28353966606457637, "learning_rate": 2.4795799299883316e-05, "loss": 0.3386, "step": 1055 }, { "epoch": 1.662992125984252, "grad_norm": 0.3074255286817048, "learning_rate": 2.4766627771295217e-05, "loss": 0.351, "step": 1056 }, { "epoch": 1.6645669291338583, "grad_norm": 0.34514808004274605, "learning_rate": 2.4737456242707118e-05, "loss": 0.3612, "step": 1057 }, { "epoch": 1.6661417322834646, "grad_norm": 0.4056243476688342, "learning_rate": 2.470828471411902e-05, "loss": 0.3787, "step": 1058 }, { "epoch": 1.6677165354330707, "grad_norm": 0.28978214511120676, "learning_rate": 2.4679113185530923e-05, "loss": 0.3863, "step": 1059 }, { "epoch": 1.6692913385826773, "grad_norm": 0.344056123008185, "learning_rate": 2.4649941656942824e-05, "loss": 0.3771, "step": 1060 }, { "epoch": 1.6708661417322834, "grad_norm": 0.37456162672412585, "learning_rate": 2.462077012835473e-05, "loss": 0.3604, "step": 1061 }, { "epoch": 1.6724409448818898, "grad_norm": 0.2757293746368924, "learning_rate": 2.459159859976663e-05, "loss": 0.3508, "step": 1062 }, { "epoch": 1.674015748031496, "grad_norm": 0.2717004784961013, "learning_rate": 2.456242707117853e-05, "loss": 0.3557, "step": 1063 }, { "epoch": 1.6755905511811022, "grad_norm": 0.32497557250560594, "learning_rate": 2.4533255542590432e-05, "loss": 0.3705, "step": 1064 }, { "epoch": 1.6771653543307088, "grad_norm": 0.33229556447056763, "learning_rate": 2.4504084014002333e-05, "loss": 0.373, "step": 1065 }, { "epoch": 1.678740157480315, "grad_norm": 0.2836462996779695, "learning_rate": 2.4474912485414238e-05, "loss": 0.3779, "step": 1066 }, { "epoch": 1.6803149606299213, "grad_norm": 0.26339583107354314, "learning_rate": 2.444574095682614e-05, "loss": 0.3439, "step": 1067 }, { "epoch": 1.6818897637795276, "grad_norm": 0.42077261425697976, "learning_rate": 2.441656942823804e-05, "loss": 0.3608, "step": 1068 }, { "epoch": 1.6834645669291337, "grad_norm": 0.28132091764300093, "learning_rate": 2.4387397899649944e-05, "loss": 0.3625, "step": 1069 }, { "epoch": 1.6850393700787403, "grad_norm": 0.27547253295112745, "learning_rate": 2.4358226371061845e-05, "loss": 0.3469, "step": 1070 }, { "epoch": 1.6866141732283464, "grad_norm": 0.3238820076008639, "learning_rate": 2.4329054842473746e-05, "loss": 0.344, "step": 1071 }, { "epoch": 1.6881889763779527, "grad_norm": 0.2993432753457173, "learning_rate": 2.4299883313885647e-05, "loss": 0.3559, "step": 1072 }, { "epoch": 1.689763779527559, "grad_norm": 0.3135174284878154, "learning_rate": 2.4270711785297552e-05, "loss": 0.3442, "step": 1073 }, { "epoch": 1.6913385826771652, "grad_norm": 0.34324717966537965, "learning_rate": 2.4241540256709453e-05, "loss": 0.3605, "step": 1074 }, { "epoch": 1.6929133858267718, "grad_norm": 0.33215793576825997, "learning_rate": 2.4212368728121354e-05, "loss": 0.3512, "step": 1075 }, { "epoch": 1.6944881889763779, "grad_norm": 0.31001683795506346, "learning_rate": 2.4183197199533255e-05, "loss": 0.3706, "step": 1076 }, { "epoch": 1.6960629921259842, "grad_norm": 0.365059607859085, "learning_rate": 2.4154025670945156e-05, "loss": 0.342, "step": 1077 }, { "epoch": 1.6976377952755906, "grad_norm": 0.37216980786643056, "learning_rate": 2.412485414235706e-05, "loss": 0.3408, "step": 1078 }, { "epoch": 1.699212598425197, "grad_norm": 0.2571811621499541, "learning_rate": 2.4095682613768962e-05, "loss": 0.3581, "step": 1079 }, { "epoch": 1.7007874015748032, "grad_norm": 0.2994141168613742, "learning_rate": 2.4066511085180866e-05, "loss": 0.3498, "step": 1080 }, { "epoch": 1.7023622047244094, "grad_norm": 0.3552093661917371, "learning_rate": 2.4037339556592767e-05, "loss": 0.362, "step": 1081 }, { "epoch": 1.7039370078740157, "grad_norm": 0.3557141985472291, "learning_rate": 2.400816802800467e-05, "loss": 0.3692, "step": 1082 }, { "epoch": 1.705511811023622, "grad_norm": 0.33253704269448997, "learning_rate": 2.397899649941657e-05, "loss": 0.3746, "step": 1083 }, { "epoch": 1.7070866141732284, "grad_norm": 0.3198198906764904, "learning_rate": 2.394982497082847e-05, "loss": 0.3465, "step": 1084 }, { "epoch": 1.7086614173228347, "grad_norm": 0.3866280992455605, "learning_rate": 2.392065344224037e-05, "loss": 0.3681, "step": 1085 }, { "epoch": 1.7102362204724408, "grad_norm": 0.28695095769700785, "learning_rate": 2.3891481913652276e-05, "loss": 0.3577, "step": 1086 }, { "epoch": 1.7118110236220474, "grad_norm": 0.3727414253712588, "learning_rate": 2.386231038506418e-05, "loss": 0.3573, "step": 1087 }, { "epoch": 1.7133858267716535, "grad_norm": 0.29719780086702335, "learning_rate": 2.3833138856476082e-05, "loss": 0.3354, "step": 1088 }, { "epoch": 1.7149606299212599, "grad_norm": 0.3192153993712115, "learning_rate": 2.3803967327887983e-05, "loss": 0.3868, "step": 1089 }, { "epoch": 1.7165354330708662, "grad_norm": 0.31201357534978796, "learning_rate": 2.3774795799299884e-05, "loss": 0.3506, "step": 1090 }, { "epoch": 1.7181102362204723, "grad_norm": 0.3061445446526045, "learning_rate": 2.3745624270711785e-05, "loss": 0.36, "step": 1091 }, { "epoch": 1.7196850393700789, "grad_norm": 0.3300656011459368, "learning_rate": 2.3716452742123686e-05, "loss": 0.3664, "step": 1092 }, { "epoch": 1.721259842519685, "grad_norm": 0.2737547716809561, "learning_rate": 2.368728121353559e-05, "loss": 0.3471, "step": 1093 }, { "epoch": 1.7228346456692913, "grad_norm": 0.33789741058074113, "learning_rate": 2.3658109684947495e-05, "loss": 0.3586, "step": 1094 }, { "epoch": 1.7244094488188977, "grad_norm": 0.26617684918878465, "learning_rate": 2.3628938156359396e-05, "loss": 0.3402, "step": 1095 }, { "epoch": 1.7259842519685038, "grad_norm": 0.26861032964949383, "learning_rate": 2.3599766627771297e-05, "loss": 0.3615, "step": 1096 }, { "epoch": 1.7275590551181104, "grad_norm": 0.27329950108488815, "learning_rate": 2.3570595099183198e-05, "loss": 0.374, "step": 1097 }, { "epoch": 1.7291338582677165, "grad_norm": 0.3180531165880881, "learning_rate": 2.35414235705951e-05, "loss": 0.3532, "step": 1098 }, { "epoch": 1.7307086614173228, "grad_norm": 0.2696363396663831, "learning_rate": 2.3512252042007e-05, "loss": 0.3568, "step": 1099 }, { "epoch": 1.7322834645669292, "grad_norm": 0.31373799901927435, "learning_rate": 2.3483080513418905e-05, "loss": 0.3427, "step": 1100 }, { "epoch": 1.7338582677165353, "grad_norm": 0.3211680158306515, "learning_rate": 2.3453908984830806e-05, "loss": 0.37, "step": 1101 }, { "epoch": 1.7354330708661418, "grad_norm": 0.28891281748685654, "learning_rate": 2.3424737456242707e-05, "loss": 0.3591, "step": 1102 }, { "epoch": 1.737007874015748, "grad_norm": 0.32709752373228707, "learning_rate": 2.339556592765461e-05, "loss": 0.3709, "step": 1103 }, { "epoch": 1.7385826771653543, "grad_norm": 0.285966042054201, "learning_rate": 2.3366394399066513e-05, "loss": 0.3684, "step": 1104 }, { "epoch": 1.7401574803149606, "grad_norm": 0.3022418939653369, "learning_rate": 2.3337222870478414e-05, "loss": 0.3514, "step": 1105 }, { "epoch": 1.7417322834645668, "grad_norm": 0.31570715954633033, "learning_rate": 2.3308051341890315e-05, "loss": 0.3697, "step": 1106 }, { "epoch": 1.7433070866141733, "grad_norm": 0.29511264041279445, "learning_rate": 2.327887981330222e-05, "loss": 0.364, "step": 1107 }, { "epoch": 1.7448818897637794, "grad_norm": 0.3146765894926815, "learning_rate": 2.324970828471412e-05, "loss": 0.3583, "step": 1108 }, { "epoch": 1.7464566929133858, "grad_norm": 0.3589786065813507, "learning_rate": 2.322053675612602e-05, "loss": 0.3552, "step": 1109 }, { "epoch": 1.7480314960629921, "grad_norm": 0.32324577537385063, "learning_rate": 2.3191365227537922e-05, "loss": 0.3563, "step": 1110 }, { "epoch": 1.7496062992125985, "grad_norm": 0.3350545271272399, "learning_rate": 2.3162193698949827e-05, "loss": 0.3819, "step": 1111 }, { "epoch": 1.7511811023622048, "grad_norm": 0.3349965774495849, "learning_rate": 2.3133022170361728e-05, "loss": 0.3496, "step": 1112 }, { "epoch": 1.752755905511811, "grad_norm": 0.27685755027644393, "learning_rate": 2.310385064177363e-05, "loss": 0.3515, "step": 1113 }, { "epoch": 1.7543307086614173, "grad_norm": 0.32776196257042217, "learning_rate": 2.3074679113185534e-05, "loss": 0.367, "step": 1114 }, { "epoch": 1.7559055118110236, "grad_norm": 0.2948332017574089, "learning_rate": 2.3045507584597435e-05, "loss": 0.3633, "step": 1115 }, { "epoch": 1.75748031496063, "grad_norm": 0.2525170239249286, "learning_rate": 2.3016336056009336e-05, "loss": 0.3648, "step": 1116 }, { "epoch": 1.7590551181102363, "grad_norm": 0.28674275324267784, "learning_rate": 2.2987164527421237e-05, "loss": 0.3681, "step": 1117 }, { "epoch": 1.7606299212598424, "grad_norm": 0.2816904412307423, "learning_rate": 2.2957992998833138e-05, "loss": 0.3625, "step": 1118 }, { "epoch": 1.762204724409449, "grad_norm": 0.2860643904925843, "learning_rate": 2.2928821470245042e-05, "loss": 0.3483, "step": 1119 }, { "epoch": 1.763779527559055, "grad_norm": 0.28269657190108993, "learning_rate": 2.2899649941656943e-05, "loss": 0.3812, "step": 1120 }, { "epoch": 1.7653543307086614, "grad_norm": 0.34310103945245196, "learning_rate": 2.2870478413068848e-05, "loss": 0.3614, "step": 1121 }, { "epoch": 1.7669291338582678, "grad_norm": 0.2633512116945133, "learning_rate": 2.284130688448075e-05, "loss": 0.3594, "step": 1122 }, { "epoch": 1.768503937007874, "grad_norm": 0.2542595828251316, "learning_rate": 2.281213535589265e-05, "loss": 0.3366, "step": 1123 }, { "epoch": 1.7700787401574805, "grad_norm": 0.2708311801748683, "learning_rate": 2.278296382730455e-05, "loss": 0.3482, "step": 1124 }, { "epoch": 1.7716535433070866, "grad_norm": 0.2805600904612669, "learning_rate": 2.2753792298716452e-05, "loss": 0.3634, "step": 1125 }, { "epoch": 1.773228346456693, "grad_norm": 0.30400642974173814, "learning_rate": 2.2724620770128353e-05, "loss": 0.3657, "step": 1126 }, { "epoch": 1.7748031496062993, "grad_norm": 0.2766311319065976, "learning_rate": 2.2695449241540258e-05, "loss": 0.3365, "step": 1127 }, { "epoch": 1.7763779527559054, "grad_norm": 0.2943927144926876, "learning_rate": 2.2666277712952162e-05, "loss": 0.3424, "step": 1128 }, { "epoch": 1.777952755905512, "grad_norm": 0.3934949457186155, "learning_rate": 2.2637106184364063e-05, "loss": 0.3653, "step": 1129 }, { "epoch": 1.779527559055118, "grad_norm": 0.2502711435999886, "learning_rate": 2.2607934655775964e-05, "loss": 0.3809, "step": 1130 }, { "epoch": 1.7811023622047244, "grad_norm": 0.3448551185020847, "learning_rate": 2.2578763127187865e-05, "loss": 0.3406, "step": 1131 }, { "epoch": 1.7826771653543307, "grad_norm": 0.2778622616780487, "learning_rate": 2.2549591598599767e-05, "loss": 0.3709, "step": 1132 }, { "epoch": 1.7842519685039369, "grad_norm": 0.27186867009513466, "learning_rate": 2.2520420070011668e-05, "loss": 0.351, "step": 1133 }, { "epoch": 1.7858267716535434, "grad_norm": 0.26091408051117015, "learning_rate": 2.2491248541423572e-05, "loss": 0.3627, "step": 1134 }, { "epoch": 1.7874015748031495, "grad_norm": 0.31865748084224993, "learning_rate": 2.2462077012835473e-05, "loss": 0.3841, "step": 1135 }, { "epoch": 1.7889763779527559, "grad_norm": 0.2930143366991859, "learning_rate": 2.2432905484247378e-05, "loss": 0.3606, "step": 1136 }, { "epoch": 1.7905511811023622, "grad_norm": 0.2505720076569231, "learning_rate": 2.240373395565928e-05, "loss": 0.36, "step": 1137 }, { "epoch": 1.7921259842519683, "grad_norm": 0.2932018953594446, "learning_rate": 2.237456242707118e-05, "loss": 0.3537, "step": 1138 }, { "epoch": 1.793700787401575, "grad_norm": 0.26287603922313396, "learning_rate": 2.234539089848308e-05, "loss": 0.3476, "step": 1139 }, { "epoch": 1.795275590551181, "grad_norm": 0.2755927075968409, "learning_rate": 2.2316219369894982e-05, "loss": 0.3678, "step": 1140 }, { "epoch": 1.7968503937007874, "grad_norm": 0.2893547687725689, "learning_rate": 2.2287047841306886e-05, "loss": 0.3583, "step": 1141 }, { "epoch": 1.7984251968503937, "grad_norm": 0.2629723359710728, "learning_rate": 2.2257876312718788e-05, "loss": 0.3588, "step": 1142 }, { "epoch": 1.8, "grad_norm": 0.2766836591969898, "learning_rate": 2.222870478413069e-05, "loss": 0.356, "step": 1143 }, { "epoch": 1.8015748031496064, "grad_norm": 0.2693294822198235, "learning_rate": 2.2199533255542593e-05, "loss": 0.3562, "step": 1144 }, { "epoch": 1.8031496062992125, "grad_norm": 0.2710767260950884, "learning_rate": 2.2170361726954494e-05, "loss": 0.3527, "step": 1145 }, { "epoch": 1.8047244094488188, "grad_norm": 0.2716429381612106, "learning_rate": 2.2141190198366395e-05, "loss": 0.3715, "step": 1146 }, { "epoch": 1.8062992125984252, "grad_norm": 0.29903174077650385, "learning_rate": 2.2112018669778296e-05, "loss": 0.3514, "step": 1147 }, { "epoch": 1.8078740157480315, "grad_norm": 0.4633774719081259, "learning_rate": 2.20828471411902e-05, "loss": 0.3606, "step": 1148 }, { "epoch": 1.8094488188976379, "grad_norm": 0.3033457484939459, "learning_rate": 2.2053675612602102e-05, "loss": 0.3624, "step": 1149 }, { "epoch": 1.811023622047244, "grad_norm": 0.3141425548211974, "learning_rate": 2.2024504084014003e-05, "loss": 0.3699, "step": 1150 }, { "epoch": 1.8125984251968505, "grad_norm": 0.30461824889993083, "learning_rate": 2.1995332555425904e-05, "loss": 0.3766, "step": 1151 }, { "epoch": 1.8141732283464567, "grad_norm": 0.28321728461278745, "learning_rate": 2.1966161026837805e-05, "loss": 0.3462, "step": 1152 }, { "epoch": 1.815748031496063, "grad_norm": 0.310496720001963, "learning_rate": 2.193698949824971e-05, "loss": 0.3604, "step": 1153 }, { "epoch": 1.8173228346456693, "grad_norm": 0.2798900245424523, "learning_rate": 2.190781796966161e-05, "loss": 0.3459, "step": 1154 }, { "epoch": 1.8188976377952755, "grad_norm": 0.27212318641467764, "learning_rate": 2.1878646441073515e-05, "loss": 0.3458, "step": 1155 }, { "epoch": 1.820472440944882, "grad_norm": 0.2512187299815126, "learning_rate": 2.1849474912485416e-05, "loss": 0.3506, "step": 1156 }, { "epoch": 1.8220472440944881, "grad_norm": 0.280408953408338, "learning_rate": 2.1820303383897317e-05, "loss": 0.36, "step": 1157 }, { "epoch": 1.8236220472440945, "grad_norm": 0.28892513307162937, "learning_rate": 2.179113185530922e-05, "loss": 0.3727, "step": 1158 }, { "epoch": 1.8251968503937008, "grad_norm": 0.2729556622516543, "learning_rate": 2.176196032672112e-05, "loss": 0.3466, "step": 1159 }, { "epoch": 1.826771653543307, "grad_norm": 0.29627727863964604, "learning_rate": 2.173278879813302e-05, "loss": 0.3577, "step": 1160 }, { "epoch": 1.8283464566929135, "grad_norm": 0.27426005191317154, "learning_rate": 2.1703617269544925e-05, "loss": 0.3535, "step": 1161 }, { "epoch": 1.8299212598425196, "grad_norm": 0.2933857786735975, "learning_rate": 2.167444574095683e-05, "loss": 0.3591, "step": 1162 }, { "epoch": 1.831496062992126, "grad_norm": 0.27663628119628286, "learning_rate": 2.164527421236873e-05, "loss": 0.3518, "step": 1163 }, { "epoch": 1.8330708661417323, "grad_norm": 0.2600352217414414, "learning_rate": 2.161610268378063e-05, "loss": 0.3591, "step": 1164 }, { "epoch": 1.8346456692913384, "grad_norm": 0.2715545147696148, "learning_rate": 2.1586931155192533e-05, "loss": 0.3487, "step": 1165 }, { "epoch": 1.836220472440945, "grad_norm": 0.26258199123652554, "learning_rate": 2.1557759626604434e-05, "loss": 0.357, "step": 1166 }, { "epoch": 1.8377952755905511, "grad_norm": 0.25052581460297335, "learning_rate": 2.1528588098016335e-05, "loss": 0.3777, "step": 1167 }, { "epoch": 1.8393700787401575, "grad_norm": 0.2906210223537665, "learning_rate": 2.149941656942824e-05, "loss": 0.3801, "step": 1168 }, { "epoch": 1.8409448818897638, "grad_norm": 0.31786981405054565, "learning_rate": 2.1470245040840144e-05, "loss": 0.3671, "step": 1169 }, { "epoch": 1.84251968503937, "grad_norm": 0.6456548856528942, "learning_rate": 2.1441073512252045e-05, "loss": 0.3588, "step": 1170 }, { "epoch": 1.8440944881889765, "grad_norm": 0.3143158897749165, "learning_rate": 2.1411901983663946e-05, "loss": 0.3786, "step": 1171 }, { "epoch": 1.8456692913385826, "grad_norm": 0.37250503640054655, "learning_rate": 2.1382730455075847e-05, "loss": 0.3674, "step": 1172 }, { "epoch": 1.847244094488189, "grad_norm": 0.27383168569175625, "learning_rate": 2.1353558926487748e-05, "loss": 0.3454, "step": 1173 }, { "epoch": 1.8488188976377953, "grad_norm": 0.3046609432264204, "learning_rate": 2.132438739789965e-05, "loss": 0.3487, "step": 1174 }, { "epoch": 1.8503937007874016, "grad_norm": 0.2804848649179638, "learning_rate": 2.1295215869311554e-05, "loss": 0.3576, "step": 1175 }, { "epoch": 1.851968503937008, "grad_norm": 0.31479120011240164, "learning_rate": 2.1266044340723455e-05, "loss": 0.3774, "step": 1176 }, { "epoch": 1.853543307086614, "grad_norm": 0.2610680429239558, "learning_rate": 2.123687281213536e-05, "loss": 0.3586, "step": 1177 }, { "epoch": 1.8551181102362204, "grad_norm": 0.24882063017326872, "learning_rate": 2.120770128354726e-05, "loss": 0.3518, "step": 1178 }, { "epoch": 1.8566929133858268, "grad_norm": 0.2790797766837998, "learning_rate": 2.117852975495916e-05, "loss": 0.3842, "step": 1179 }, { "epoch": 1.858267716535433, "grad_norm": 0.2573815359870439, "learning_rate": 2.1149358226371062e-05, "loss": 0.3564, "step": 1180 }, { "epoch": 1.8598425196850394, "grad_norm": 0.3243613234743167, "learning_rate": 2.1120186697782964e-05, "loss": 0.3816, "step": 1181 }, { "epoch": 1.8614173228346456, "grad_norm": 0.2655386062201565, "learning_rate": 2.1091015169194868e-05, "loss": 0.354, "step": 1182 }, { "epoch": 1.8629921259842521, "grad_norm": 0.2740963939067814, "learning_rate": 2.106184364060677e-05, "loss": 0.3656, "step": 1183 }, { "epoch": 1.8645669291338582, "grad_norm": 0.2843498651753472, "learning_rate": 2.103267211201867e-05, "loss": 0.3693, "step": 1184 }, { "epoch": 1.8661417322834646, "grad_norm": 0.22977073466545048, "learning_rate": 2.100350058343057e-05, "loss": 0.3613, "step": 1185 }, { "epoch": 1.867716535433071, "grad_norm": 0.2258303054677578, "learning_rate": 2.0974329054842476e-05, "loss": 0.3486, "step": 1186 }, { "epoch": 1.869291338582677, "grad_norm": 0.3199488237390706, "learning_rate": 2.0945157526254377e-05, "loss": 0.354, "step": 1187 }, { "epoch": 1.8708661417322836, "grad_norm": 0.28056094646695073, "learning_rate": 2.0915985997666278e-05, "loss": 0.3652, "step": 1188 }, { "epoch": 1.8724409448818897, "grad_norm": 0.27727322297434054, "learning_rate": 2.0886814469078182e-05, "loss": 0.3467, "step": 1189 }, { "epoch": 1.874015748031496, "grad_norm": 0.3212931887828484, "learning_rate": 2.0857642940490083e-05, "loss": 0.3694, "step": 1190 }, { "epoch": 1.8755905511811024, "grad_norm": 0.25682290733656993, "learning_rate": 2.0828471411901985e-05, "loss": 0.369, "step": 1191 }, { "epoch": 1.8771653543307085, "grad_norm": 0.2711768733269575, "learning_rate": 2.0799299883313886e-05, "loss": 0.3649, "step": 1192 }, { "epoch": 1.878740157480315, "grad_norm": 0.28072418917114983, "learning_rate": 2.0770128354725787e-05, "loss": 0.3504, "step": 1193 }, { "epoch": 1.8803149606299212, "grad_norm": 0.25397356192410675, "learning_rate": 2.074095682613769e-05, "loss": 0.3644, "step": 1194 }, { "epoch": 1.8818897637795275, "grad_norm": 0.3287388677139133, "learning_rate": 2.0711785297549592e-05, "loss": 0.3527, "step": 1195 }, { "epoch": 1.8834645669291339, "grad_norm": 0.33404014115596214, "learning_rate": 2.0682613768961497e-05, "loss": 0.3691, "step": 1196 }, { "epoch": 1.88503937007874, "grad_norm": 0.2891705434985061, "learning_rate": 2.0653442240373398e-05, "loss": 0.348, "step": 1197 }, { "epoch": 1.8866141732283466, "grad_norm": 0.32641210869017834, "learning_rate": 2.06242707117853e-05, "loss": 0.3598, "step": 1198 }, { "epoch": 1.8881889763779527, "grad_norm": 0.29895184341957237, "learning_rate": 2.05950991831972e-05, "loss": 0.3598, "step": 1199 }, { "epoch": 1.889763779527559, "grad_norm": 0.2948824614589674, "learning_rate": 2.05659276546091e-05, "loss": 0.3683, "step": 1200 }, { "epoch": 1.8913385826771654, "grad_norm": 0.6500908297443422, "learning_rate": 2.0536756126021002e-05, "loss": 0.3829, "step": 1201 }, { "epoch": 1.8929133858267715, "grad_norm": 0.3361829307280847, "learning_rate": 2.0507584597432907e-05, "loss": 0.3749, "step": 1202 }, { "epoch": 1.894488188976378, "grad_norm": 0.33206084987704426, "learning_rate": 2.047841306884481e-05, "loss": 0.3553, "step": 1203 }, { "epoch": 1.8960629921259842, "grad_norm": 0.31155373414783866, "learning_rate": 2.0449241540256712e-05, "loss": 0.337, "step": 1204 }, { "epoch": 1.8976377952755905, "grad_norm": 0.3607156942176888, "learning_rate": 2.0420070011668613e-05, "loss": 0.3448, "step": 1205 }, { "epoch": 1.8992125984251969, "grad_norm": 0.2850398075005334, "learning_rate": 2.0390898483080514e-05, "loss": 0.3531, "step": 1206 }, { "epoch": 1.9007874015748032, "grad_norm": 0.3335823777797334, "learning_rate": 2.0361726954492415e-05, "loss": 0.3337, "step": 1207 }, { "epoch": 1.9023622047244095, "grad_norm": 0.3024128147983549, "learning_rate": 2.0332555425904316e-05, "loss": 0.3513, "step": 1208 }, { "epoch": 1.9039370078740157, "grad_norm": 0.31266702984600436, "learning_rate": 2.030338389731622e-05, "loss": 0.355, "step": 1209 }, { "epoch": 1.905511811023622, "grad_norm": 0.30292242016691934, "learning_rate": 2.0274212368728122e-05, "loss": 0.3414, "step": 1210 }, { "epoch": 1.9070866141732283, "grad_norm": 0.9584789305854943, "learning_rate": 2.0245040840140026e-05, "loss": 0.3731, "step": 1211 }, { "epoch": 1.9086614173228347, "grad_norm": 0.3302730516884143, "learning_rate": 2.0215869311551928e-05, "loss": 0.3617, "step": 1212 }, { "epoch": 1.910236220472441, "grad_norm": 0.2745471936616637, "learning_rate": 2.018669778296383e-05, "loss": 0.3747, "step": 1213 }, { "epoch": 1.9118110236220471, "grad_norm": 0.29054363662010063, "learning_rate": 2.015752625437573e-05, "loss": 0.3519, "step": 1214 }, { "epoch": 1.9133858267716537, "grad_norm": 0.29855731661297275, "learning_rate": 2.012835472578763e-05, "loss": 0.3458, "step": 1215 }, { "epoch": 1.9149606299212598, "grad_norm": 0.2661183561768449, "learning_rate": 2.0099183197199535e-05, "loss": 0.3683, "step": 1216 }, { "epoch": 1.9165354330708662, "grad_norm": 0.29598870893733864, "learning_rate": 2.0070011668611436e-05, "loss": 0.3427, "step": 1217 }, { "epoch": 1.9181102362204725, "grad_norm": 0.3212411251308691, "learning_rate": 2.0040840140023337e-05, "loss": 0.3711, "step": 1218 }, { "epoch": 1.9196850393700786, "grad_norm": 0.25029426727574794, "learning_rate": 2.0011668611435242e-05, "loss": 0.3501, "step": 1219 }, { "epoch": 1.9212598425196852, "grad_norm": 0.2574479661739233, "learning_rate": 1.9982497082847143e-05, "loss": 0.3429, "step": 1220 }, { "epoch": 1.9228346456692913, "grad_norm": 0.29869406325040776, "learning_rate": 1.9953325554259044e-05, "loss": 0.3803, "step": 1221 }, { "epoch": 1.9244094488188976, "grad_norm": 0.2928988816595529, "learning_rate": 1.9924154025670945e-05, "loss": 0.3592, "step": 1222 }, { "epoch": 1.925984251968504, "grad_norm": 0.26352123046053677, "learning_rate": 1.989498249708285e-05, "loss": 0.3587, "step": 1223 }, { "epoch": 1.92755905511811, "grad_norm": 0.25419797939028305, "learning_rate": 1.986581096849475e-05, "loss": 0.3571, "step": 1224 }, { "epoch": 1.9291338582677167, "grad_norm": 0.2944556236520331, "learning_rate": 1.9836639439906652e-05, "loss": 0.3525, "step": 1225 }, { "epoch": 1.9307086614173228, "grad_norm": 0.254633392835903, "learning_rate": 1.9807467911318553e-05, "loss": 0.3626, "step": 1226 }, { "epoch": 1.9322834645669291, "grad_norm": 0.3041895815525449, "learning_rate": 1.9778296382730457e-05, "loss": 0.3447, "step": 1227 }, { "epoch": 1.9338582677165355, "grad_norm": 0.3822088263518416, "learning_rate": 1.974912485414236e-05, "loss": 0.3715, "step": 1228 }, { "epoch": 1.9354330708661416, "grad_norm": 0.28899190031293714, "learning_rate": 1.971995332555426e-05, "loss": 0.3533, "step": 1229 }, { "epoch": 1.9370078740157481, "grad_norm": 0.39202582888653836, "learning_rate": 1.9690781796966164e-05, "loss": 0.3632, "step": 1230 }, { "epoch": 1.9385826771653543, "grad_norm": 0.2832518406957648, "learning_rate": 1.9661610268378065e-05, "loss": 0.3509, "step": 1231 }, { "epoch": 1.9401574803149606, "grad_norm": 0.3604581659745045, "learning_rate": 1.9632438739789966e-05, "loss": 0.3563, "step": 1232 }, { "epoch": 1.941732283464567, "grad_norm": 0.33390756303889824, "learning_rate": 1.9603267211201867e-05, "loss": 0.3758, "step": 1233 }, { "epoch": 1.943307086614173, "grad_norm": 0.28729009343655976, "learning_rate": 1.9574095682613768e-05, "loss": 0.3534, "step": 1234 }, { "epoch": 1.9448818897637796, "grad_norm": 0.3406625629174988, "learning_rate": 1.954492415402567e-05, "loss": 0.3493, "step": 1235 }, { "epoch": 1.9464566929133857, "grad_norm": 0.3104910738417762, "learning_rate": 1.9515752625437574e-05, "loss": 0.3432, "step": 1236 }, { "epoch": 1.948031496062992, "grad_norm": 0.3162298251802148, "learning_rate": 1.9486581096849478e-05, "loss": 0.3768, "step": 1237 }, { "epoch": 1.9496062992125984, "grad_norm": 0.349613987125828, "learning_rate": 1.945740956826138e-05, "loss": 0.3583, "step": 1238 }, { "epoch": 1.9511811023622048, "grad_norm": 0.2787478326104301, "learning_rate": 1.942823803967328e-05, "loss": 0.3628, "step": 1239 }, { "epoch": 1.952755905511811, "grad_norm": 0.2696544964264987, "learning_rate": 1.939906651108518e-05, "loss": 0.3643, "step": 1240 }, { "epoch": 1.9543307086614172, "grad_norm": 0.3493877673249194, "learning_rate": 1.9369894982497083e-05, "loss": 0.3697, "step": 1241 }, { "epoch": 1.9559055118110236, "grad_norm": 0.3030257600195658, "learning_rate": 1.9340723453908984e-05, "loss": 0.3673, "step": 1242 }, { "epoch": 1.95748031496063, "grad_norm": 0.2675450671881725, "learning_rate": 1.9311551925320888e-05, "loss": 0.345, "step": 1243 }, { "epoch": 1.9590551181102362, "grad_norm": 0.3237257487340978, "learning_rate": 1.9282380396732793e-05, "loss": 0.3502, "step": 1244 }, { "epoch": 1.9606299212598426, "grad_norm": 0.27857634183373037, "learning_rate": 1.9253208868144694e-05, "loss": 0.3518, "step": 1245 }, { "epoch": 1.9622047244094487, "grad_norm": 0.3057846764518471, "learning_rate": 1.9224037339556595e-05, "loss": 0.36, "step": 1246 }, { "epoch": 1.9637795275590553, "grad_norm": 0.2706118073224758, "learning_rate": 1.9194865810968496e-05, "loss": 0.3545, "step": 1247 }, { "epoch": 1.9653543307086614, "grad_norm": 0.2779546170782203, "learning_rate": 1.9165694282380397e-05, "loss": 0.3513, "step": 1248 }, { "epoch": 1.9669291338582677, "grad_norm": 0.3008733136246834, "learning_rate": 1.9136522753792298e-05, "loss": 0.3426, "step": 1249 }, { "epoch": 1.968503937007874, "grad_norm": 0.2706308561178664, "learning_rate": 1.9107351225204202e-05, "loss": 0.375, "step": 1250 }, { "epoch": 1.9700787401574802, "grad_norm": 0.27711069739240246, "learning_rate": 1.9078179696616104e-05, "loss": 0.3519, "step": 1251 }, { "epoch": 1.9716535433070868, "grad_norm": 0.3161511691407547, "learning_rate": 1.9049008168028008e-05, "loss": 0.3752, "step": 1252 }, { "epoch": 1.9732283464566929, "grad_norm": 0.30796063731515005, "learning_rate": 1.901983663943991e-05, "loss": 0.3899, "step": 1253 }, { "epoch": 1.9748031496062992, "grad_norm": 0.2765754697578041, "learning_rate": 1.899066511085181e-05, "loss": 0.346, "step": 1254 }, { "epoch": 1.9763779527559056, "grad_norm": 0.2563057787461007, "learning_rate": 1.896149358226371e-05, "loss": 0.362, "step": 1255 }, { "epoch": 1.9779527559055117, "grad_norm": 0.33323050545789584, "learning_rate": 1.8932322053675612e-05, "loss": 0.3483, "step": 1256 }, { "epoch": 1.9795275590551182, "grad_norm": 0.2518089033216471, "learning_rate": 1.8903150525087517e-05, "loss": 0.3469, "step": 1257 }, { "epoch": 1.9811023622047244, "grad_norm": 0.29229271830153225, "learning_rate": 1.8873978996499418e-05, "loss": 0.3661, "step": 1258 }, { "epoch": 1.9826771653543307, "grad_norm": 0.2616962466685596, "learning_rate": 1.884480746791132e-05, "loss": 0.348, "step": 1259 }, { "epoch": 1.984251968503937, "grad_norm": 0.2394958289909118, "learning_rate": 1.881563593932322e-05, "loss": 0.343, "step": 1260 }, { "epoch": 1.9858267716535432, "grad_norm": 0.25662433631908865, "learning_rate": 1.8786464410735125e-05, "loss": 0.3499, "step": 1261 }, { "epoch": 1.9874015748031497, "grad_norm": 0.280456942193158, "learning_rate": 1.8757292882147026e-05, "loss": 0.3494, "step": 1262 }, { "epoch": 1.9889763779527558, "grad_norm": 0.2338892717025886, "learning_rate": 1.8728121353558927e-05, "loss": 0.3469, "step": 1263 }, { "epoch": 1.9905511811023622, "grad_norm": 0.25034252354242054, "learning_rate": 1.8698949824970828e-05, "loss": 0.358, "step": 1264 }, { "epoch": 1.9921259842519685, "grad_norm": 0.27491942933067137, "learning_rate": 1.8669778296382732e-05, "loss": 0.3568, "step": 1265 }, { "epoch": 1.9937007874015746, "grad_norm": 0.2711922571242096, "learning_rate": 1.8640606767794633e-05, "loss": 0.3588, "step": 1266 }, { "epoch": 1.9952755905511812, "grad_norm": 0.24328530965315842, "learning_rate": 1.8611435239206534e-05, "loss": 0.3617, "step": 1267 }, { "epoch": 1.9968503937007873, "grad_norm": 0.26568699456528055, "learning_rate": 1.8582263710618436e-05, "loss": 0.348, "step": 1268 }, { "epoch": 1.9984251968503937, "grad_norm": 0.24657370377078203, "learning_rate": 1.855309218203034e-05, "loss": 0.3709, "step": 1269 }, { "epoch": 2.0, "grad_norm": 0.2662353107766164, "learning_rate": 1.852392065344224e-05, "loss": 0.3372, "step": 1270 }, { "epoch": 2.001574803149606, "grad_norm": 0.30348507705158984, "learning_rate": 1.8494749124854142e-05, "loss": 0.2979, "step": 1271 }, { "epoch": 2.0031496062992127, "grad_norm": 0.27938824006771934, "learning_rate": 1.8465577596266047e-05, "loss": 0.2914, "step": 1272 }, { "epoch": 2.004724409448819, "grad_norm": 0.29074181096528284, "learning_rate": 1.8436406067677948e-05, "loss": 0.2898, "step": 1273 }, { "epoch": 2.0062992125984254, "grad_norm": 0.2626193937121949, "learning_rate": 1.840723453908985e-05, "loss": 0.2654, "step": 1274 }, { "epoch": 2.0078740157480315, "grad_norm": 0.3082623502112499, "learning_rate": 1.837806301050175e-05, "loss": 0.2936, "step": 1275 }, { "epoch": 2.0094488188976376, "grad_norm": 0.2880656874870232, "learning_rate": 1.834889148191365e-05, "loss": 0.2781, "step": 1276 }, { "epoch": 2.011023622047244, "grad_norm": 0.3341363675614908, "learning_rate": 1.8319719953325555e-05, "loss": 0.29, "step": 1277 }, { "epoch": 2.0125984251968503, "grad_norm": 0.3280608489454879, "learning_rate": 1.8290548424737456e-05, "loss": 0.2985, "step": 1278 }, { "epoch": 2.014173228346457, "grad_norm": 0.3103759875137305, "learning_rate": 1.826137689614936e-05, "loss": 0.2802, "step": 1279 }, { "epoch": 2.015748031496063, "grad_norm": 0.3173240049516503, "learning_rate": 1.8232205367561262e-05, "loss": 0.2834, "step": 1280 }, { "epoch": 2.017322834645669, "grad_norm": 0.3254425023801191, "learning_rate": 1.8203033838973163e-05, "loss": 0.2674, "step": 1281 }, { "epoch": 2.0188976377952756, "grad_norm": 0.28826120610765504, "learning_rate": 1.8173862310385064e-05, "loss": 0.283, "step": 1282 }, { "epoch": 2.0204724409448818, "grad_norm": 0.282092023270021, "learning_rate": 1.8144690781796965e-05, "loss": 0.2892, "step": 1283 }, { "epoch": 2.0220472440944883, "grad_norm": 0.29637916681470594, "learning_rate": 1.8115519253208866e-05, "loss": 0.2697, "step": 1284 }, { "epoch": 2.0236220472440944, "grad_norm": 0.32547880582773886, "learning_rate": 1.808634772462077e-05, "loss": 0.2749, "step": 1285 }, { "epoch": 2.0251968503937006, "grad_norm": 0.2931538983678935, "learning_rate": 1.8057176196032675e-05, "loss": 0.2767, "step": 1286 }, { "epoch": 2.026771653543307, "grad_norm": 0.3040929343817873, "learning_rate": 1.8028004667444576e-05, "loss": 0.2884, "step": 1287 }, { "epoch": 2.0283464566929132, "grad_norm": 0.3058305106796849, "learning_rate": 1.7998833138856477e-05, "loss": 0.2668, "step": 1288 }, { "epoch": 2.02992125984252, "grad_norm": 0.26881233667537685, "learning_rate": 1.796966161026838e-05, "loss": 0.2817, "step": 1289 }, { "epoch": 2.031496062992126, "grad_norm": 0.2662742884783517, "learning_rate": 1.794049008168028e-05, "loss": 0.2813, "step": 1290 }, { "epoch": 2.0330708661417325, "grad_norm": 0.2520256488249296, "learning_rate": 1.791131855309218e-05, "loss": 0.2883, "step": 1291 }, { "epoch": 2.0346456692913386, "grad_norm": 0.257014577572332, "learning_rate": 1.7882147024504085e-05, "loss": 0.2859, "step": 1292 }, { "epoch": 2.0362204724409447, "grad_norm": 0.2698519163880076, "learning_rate": 1.7852975495915986e-05, "loss": 0.2832, "step": 1293 }, { "epoch": 2.0377952755905513, "grad_norm": 0.27446441263232507, "learning_rate": 1.782380396732789e-05, "loss": 0.2703, "step": 1294 }, { "epoch": 2.0393700787401574, "grad_norm": 0.2562034684432591, "learning_rate": 1.7794632438739792e-05, "loss": 0.2715, "step": 1295 }, { "epoch": 2.040944881889764, "grad_norm": 0.27688102082228444, "learning_rate": 1.7765460910151693e-05, "loss": 0.2817, "step": 1296 }, { "epoch": 2.04251968503937, "grad_norm": 0.25139992939258277, "learning_rate": 1.7736289381563594e-05, "loss": 0.2754, "step": 1297 }, { "epoch": 2.044094488188976, "grad_norm": 0.2530887806313908, "learning_rate": 1.7707117852975495e-05, "loss": 0.2633, "step": 1298 }, { "epoch": 2.0456692913385828, "grad_norm": 0.2415835070671148, "learning_rate": 1.76779463243874e-05, "loss": 0.2827, "step": 1299 }, { "epoch": 2.047244094488189, "grad_norm": 0.2340624523169479, "learning_rate": 1.76487747957993e-05, "loss": 0.272, "step": 1300 }, { "epoch": 2.0488188976377955, "grad_norm": 0.3040554485983662, "learning_rate": 1.76196032672112e-05, "loss": 0.2845, "step": 1301 }, { "epoch": 2.0503937007874016, "grad_norm": 0.2410041561211332, "learning_rate": 1.7590431738623106e-05, "loss": 0.2941, "step": 1302 }, { "epoch": 2.0519685039370077, "grad_norm": 0.2598693744713934, "learning_rate": 1.7561260210035007e-05, "loss": 0.2889, "step": 1303 }, { "epoch": 2.0535433070866143, "grad_norm": 0.2738063458576862, "learning_rate": 1.7532088681446908e-05, "loss": 0.2896, "step": 1304 }, { "epoch": 2.0551181102362204, "grad_norm": 0.25784773733651833, "learning_rate": 1.750291715285881e-05, "loss": 0.298, "step": 1305 }, { "epoch": 2.056692913385827, "grad_norm": 0.22900390243053315, "learning_rate": 1.7473745624270714e-05, "loss": 0.2823, "step": 1306 }, { "epoch": 2.058267716535433, "grad_norm": 0.2616699111367542, "learning_rate": 1.7444574095682615e-05, "loss": 0.2695, "step": 1307 }, { "epoch": 2.059842519685039, "grad_norm": 0.2496544392423694, "learning_rate": 1.7415402567094516e-05, "loss": 0.2726, "step": 1308 }, { "epoch": 2.0614173228346457, "grad_norm": 0.27676501128749154, "learning_rate": 1.7386231038506417e-05, "loss": 0.2859, "step": 1309 }, { "epoch": 2.062992125984252, "grad_norm": 0.24071329959434057, "learning_rate": 1.7357059509918318e-05, "loss": 0.2758, "step": 1310 }, { "epoch": 2.0645669291338584, "grad_norm": 0.28960843064116176, "learning_rate": 1.7327887981330223e-05, "loss": 0.2873, "step": 1311 }, { "epoch": 2.0661417322834645, "grad_norm": 0.26661694771496597, "learning_rate": 1.7298716452742124e-05, "loss": 0.2664, "step": 1312 }, { "epoch": 2.0677165354330707, "grad_norm": 0.25082576160584696, "learning_rate": 1.7269544924154028e-05, "loss": 0.2853, "step": 1313 }, { "epoch": 2.069291338582677, "grad_norm": 0.3161314984110379, "learning_rate": 1.724037339556593e-05, "loss": 0.2894, "step": 1314 }, { "epoch": 2.0708661417322833, "grad_norm": 0.24253311284774048, "learning_rate": 1.721120186697783e-05, "loss": 0.2911, "step": 1315 }, { "epoch": 2.07244094488189, "grad_norm": 0.25009636988379963, "learning_rate": 1.718203033838973e-05, "loss": 0.2929, "step": 1316 }, { "epoch": 2.074015748031496, "grad_norm": 0.2658496670874041, "learning_rate": 1.7152858809801633e-05, "loss": 0.2842, "step": 1317 }, { "epoch": 2.0755905511811026, "grad_norm": 0.23979816144995106, "learning_rate": 1.7123687281213534e-05, "loss": 0.2842, "step": 1318 }, { "epoch": 2.0771653543307087, "grad_norm": 0.24988604957704763, "learning_rate": 1.7094515752625438e-05, "loss": 0.2809, "step": 1319 }, { "epoch": 2.078740157480315, "grad_norm": 0.23668627480835644, "learning_rate": 1.7065344224037343e-05, "loss": 0.2867, "step": 1320 }, { "epoch": 2.0803149606299214, "grad_norm": 0.22372796962707625, "learning_rate": 1.7036172695449244e-05, "loss": 0.2691, "step": 1321 }, { "epoch": 2.0818897637795275, "grad_norm": 0.23732876028457656, "learning_rate": 1.7007001166861145e-05, "loss": 0.2683, "step": 1322 }, { "epoch": 2.083464566929134, "grad_norm": 0.22875191277447482, "learning_rate": 1.6977829638273046e-05, "loss": 0.2728, "step": 1323 }, { "epoch": 2.08503937007874, "grad_norm": 0.23650458357308932, "learning_rate": 1.6948658109684947e-05, "loss": 0.2963, "step": 1324 }, { "epoch": 2.0866141732283463, "grad_norm": 0.2546424552268046, "learning_rate": 1.6919486581096848e-05, "loss": 0.2771, "step": 1325 }, { "epoch": 2.088188976377953, "grad_norm": 0.24493578350201684, "learning_rate": 1.6890315052508752e-05, "loss": 0.282, "step": 1326 }, { "epoch": 2.089763779527559, "grad_norm": 0.24225174118695483, "learning_rate": 1.6861143523920657e-05, "loss": 0.277, "step": 1327 }, { "epoch": 2.0913385826771655, "grad_norm": 0.2681411145590331, "learning_rate": 1.6831971995332558e-05, "loss": 0.2897, "step": 1328 }, { "epoch": 2.0929133858267717, "grad_norm": 0.2390142702940837, "learning_rate": 1.680280046674446e-05, "loss": 0.2959, "step": 1329 }, { "epoch": 2.094488188976378, "grad_norm": 0.24863792236442755, "learning_rate": 1.677362893815636e-05, "loss": 0.2888, "step": 1330 }, { "epoch": 2.0960629921259843, "grad_norm": 0.24051214405460905, "learning_rate": 1.674445740956826e-05, "loss": 0.2914, "step": 1331 }, { "epoch": 2.0976377952755905, "grad_norm": 0.22172573929662698, "learning_rate": 1.6715285880980162e-05, "loss": 0.2585, "step": 1332 }, { "epoch": 2.099212598425197, "grad_norm": 0.2617419266454089, "learning_rate": 1.6686114352392067e-05, "loss": 0.2841, "step": 1333 }, { "epoch": 2.100787401574803, "grad_norm": 0.27223472458514913, "learning_rate": 1.6656942823803968e-05, "loss": 0.2928, "step": 1334 }, { "epoch": 2.1023622047244093, "grad_norm": 0.26039451991221524, "learning_rate": 1.662777129521587e-05, "loss": 0.2737, "step": 1335 }, { "epoch": 2.103937007874016, "grad_norm": 0.2562865619056178, "learning_rate": 1.6598599766627773e-05, "loss": 0.2775, "step": 1336 }, { "epoch": 2.105511811023622, "grad_norm": 0.26040814046855804, "learning_rate": 1.6569428238039674e-05, "loss": 0.2752, "step": 1337 }, { "epoch": 2.1070866141732285, "grad_norm": 0.22752834029005775, "learning_rate": 1.6540256709451576e-05, "loss": 0.2832, "step": 1338 }, { "epoch": 2.1086614173228346, "grad_norm": 0.23972798529269435, "learning_rate": 1.6511085180863477e-05, "loss": 0.2824, "step": 1339 }, { "epoch": 2.1102362204724407, "grad_norm": 0.24898300468340248, "learning_rate": 1.648191365227538e-05, "loss": 0.2745, "step": 1340 }, { "epoch": 2.1118110236220473, "grad_norm": 0.24442522354897925, "learning_rate": 1.6452742123687282e-05, "loss": 0.2807, "step": 1341 }, { "epoch": 2.1133858267716534, "grad_norm": 0.2535623827813203, "learning_rate": 1.6423570595099183e-05, "loss": 0.2796, "step": 1342 }, { "epoch": 2.11496062992126, "grad_norm": 0.2883204619299495, "learning_rate": 1.6394399066511084e-05, "loss": 0.2752, "step": 1343 }, { "epoch": 2.116535433070866, "grad_norm": 0.22998809460770003, "learning_rate": 1.636522753792299e-05, "loss": 0.2903, "step": 1344 }, { "epoch": 2.1181102362204722, "grad_norm": 0.23765343656180296, "learning_rate": 1.633605600933489e-05, "loss": 0.2757, "step": 1345 }, { "epoch": 2.119685039370079, "grad_norm": 0.24791736037741835, "learning_rate": 1.630688448074679e-05, "loss": 0.2808, "step": 1346 }, { "epoch": 2.121259842519685, "grad_norm": 0.22620748787468029, "learning_rate": 1.6277712952158695e-05, "loss": 0.2667, "step": 1347 }, { "epoch": 2.1228346456692915, "grad_norm": 0.6544321669332789, "learning_rate": 1.6248541423570597e-05, "loss": 0.3074, "step": 1348 }, { "epoch": 2.1244094488188976, "grad_norm": 0.21835065867133824, "learning_rate": 1.6219369894982498e-05, "loss": 0.2729, "step": 1349 }, { "epoch": 2.1259842519685037, "grad_norm": 0.246684226526046, "learning_rate": 1.61901983663944e-05, "loss": 0.2703, "step": 1350 }, { "epoch": 2.1275590551181103, "grad_norm": 0.22887837029059144, "learning_rate": 1.61610268378063e-05, "loss": 0.2842, "step": 1351 }, { "epoch": 2.1291338582677164, "grad_norm": 0.23674286923993498, "learning_rate": 1.6131855309218204e-05, "loss": 0.2797, "step": 1352 }, { "epoch": 2.130708661417323, "grad_norm": 0.24212923426766286, "learning_rate": 1.6102683780630105e-05, "loss": 0.2718, "step": 1353 }, { "epoch": 2.132283464566929, "grad_norm": 0.21506377400732163, "learning_rate": 1.607351225204201e-05, "loss": 0.279, "step": 1354 }, { "epoch": 2.1338582677165356, "grad_norm": 0.2528663771234912, "learning_rate": 1.604434072345391e-05, "loss": 0.2935, "step": 1355 }, { "epoch": 2.1354330708661418, "grad_norm": 0.24521268194124712, "learning_rate": 1.6015169194865812e-05, "loss": 0.2787, "step": 1356 }, { "epoch": 2.137007874015748, "grad_norm": 0.22831459808627608, "learning_rate": 1.5985997666277713e-05, "loss": 0.2773, "step": 1357 }, { "epoch": 2.1385826771653544, "grad_norm": 0.20428239023830516, "learning_rate": 1.5956826137689614e-05, "loss": 0.2855, "step": 1358 }, { "epoch": 2.1401574803149606, "grad_norm": 0.23502123031268532, "learning_rate": 1.5927654609101515e-05, "loss": 0.2788, "step": 1359 }, { "epoch": 2.141732283464567, "grad_norm": 0.2622004698147426, "learning_rate": 1.589848308051342e-05, "loss": 0.2996, "step": 1360 }, { "epoch": 2.1433070866141732, "grad_norm": 0.22041705193791555, "learning_rate": 1.5869311551925324e-05, "loss": 0.2741, "step": 1361 }, { "epoch": 2.1448818897637794, "grad_norm": 0.22131128949211973, "learning_rate": 1.5840140023337225e-05, "loss": 0.2926, "step": 1362 }, { "epoch": 2.146456692913386, "grad_norm": 0.25215920644805173, "learning_rate": 1.5810968494749126e-05, "loss": 0.2838, "step": 1363 }, { "epoch": 2.148031496062992, "grad_norm": 0.2302346840354795, "learning_rate": 1.5781796966161027e-05, "loss": 0.2808, "step": 1364 }, { "epoch": 2.1496062992125986, "grad_norm": 0.2200480819157691, "learning_rate": 1.575262543757293e-05, "loss": 0.2738, "step": 1365 }, { "epoch": 2.1511811023622047, "grad_norm": 0.24976965371931567, "learning_rate": 1.572345390898483e-05, "loss": 0.2829, "step": 1366 }, { "epoch": 2.152755905511811, "grad_norm": 0.24283142949444758, "learning_rate": 1.5694282380396734e-05, "loss": 0.2704, "step": 1367 }, { "epoch": 2.1543307086614174, "grad_norm": 0.25070443608136245, "learning_rate": 1.5665110851808635e-05, "loss": 0.2651, "step": 1368 }, { "epoch": 2.1559055118110235, "grad_norm": 0.24159502571476368, "learning_rate": 1.563593932322054e-05, "loss": 0.2833, "step": 1369 }, { "epoch": 2.15748031496063, "grad_norm": 0.23466970673361934, "learning_rate": 1.560676779463244e-05, "loss": 0.2824, "step": 1370 }, { "epoch": 2.159055118110236, "grad_norm": 0.23763022188882915, "learning_rate": 1.5577596266044342e-05, "loss": 0.2917, "step": 1371 }, { "epoch": 2.1606299212598423, "grad_norm": 0.2886619074992379, "learning_rate": 1.5548424737456243e-05, "loss": 0.2922, "step": 1372 }, { "epoch": 2.162204724409449, "grad_norm": 0.22185003529262515, "learning_rate": 1.5519253208868144e-05, "loss": 0.2701, "step": 1373 }, { "epoch": 2.163779527559055, "grad_norm": 0.24760989433416156, "learning_rate": 1.549008168028005e-05, "loss": 0.2892, "step": 1374 }, { "epoch": 2.1653543307086616, "grad_norm": 0.23818006899904232, "learning_rate": 1.546091015169195e-05, "loss": 0.2711, "step": 1375 }, { "epoch": 2.1669291338582677, "grad_norm": 0.24529281643195108, "learning_rate": 1.543173862310385e-05, "loss": 0.2829, "step": 1376 }, { "epoch": 2.1685039370078742, "grad_norm": 0.2461696820564374, "learning_rate": 1.5402567094515755e-05, "loss": 0.2842, "step": 1377 }, { "epoch": 2.1700787401574804, "grad_norm": 0.2301052446850812, "learning_rate": 1.5373395565927656e-05, "loss": 0.2798, "step": 1378 }, { "epoch": 2.1716535433070865, "grad_norm": 0.23909545436211746, "learning_rate": 1.5344224037339557e-05, "loss": 0.2847, "step": 1379 }, { "epoch": 2.173228346456693, "grad_norm": 0.253922961932347, "learning_rate": 1.5315052508751458e-05, "loss": 0.301, "step": 1380 }, { "epoch": 2.174803149606299, "grad_norm": 0.21008795832614036, "learning_rate": 1.5285880980163363e-05, "loss": 0.2815, "step": 1381 }, { "epoch": 2.1763779527559057, "grad_norm": 0.2425497542424157, "learning_rate": 1.5256709451575262e-05, "loss": 0.2806, "step": 1382 }, { "epoch": 2.177952755905512, "grad_norm": 0.25876241546814416, "learning_rate": 1.5227537922987165e-05, "loss": 0.2765, "step": 1383 }, { "epoch": 2.179527559055118, "grad_norm": 0.22528917655627814, "learning_rate": 1.5198366394399066e-05, "loss": 0.2763, "step": 1384 }, { "epoch": 2.1811023622047245, "grad_norm": 0.25064356337186394, "learning_rate": 1.516919486581097e-05, "loss": 0.2739, "step": 1385 }, { "epoch": 2.1826771653543307, "grad_norm": 0.23520827812001624, "learning_rate": 1.5140023337222871e-05, "loss": 0.293, "step": 1386 }, { "epoch": 2.184251968503937, "grad_norm": 0.2508476675586527, "learning_rate": 1.5110851808634774e-05, "loss": 0.2647, "step": 1387 }, { "epoch": 2.1858267716535433, "grad_norm": 0.2561527002430274, "learning_rate": 1.5081680280046675e-05, "loss": 0.2729, "step": 1388 }, { "epoch": 2.1874015748031495, "grad_norm": 0.2332213339299002, "learning_rate": 1.5052508751458576e-05, "loss": 0.2713, "step": 1389 }, { "epoch": 2.188976377952756, "grad_norm": 0.21905458942486522, "learning_rate": 1.502333722287048e-05, "loss": 0.2836, "step": 1390 }, { "epoch": 2.190551181102362, "grad_norm": 0.2705744504594355, "learning_rate": 1.499416569428238e-05, "loss": 0.2905, "step": 1391 }, { "epoch": 2.1921259842519687, "grad_norm": 0.2605971962981395, "learning_rate": 1.4964994165694281e-05, "loss": 0.2998, "step": 1392 }, { "epoch": 2.193700787401575, "grad_norm": 0.2347589072642403, "learning_rate": 1.4935822637106184e-05, "loss": 0.2914, "step": 1393 }, { "epoch": 2.195275590551181, "grad_norm": 0.2412984155258515, "learning_rate": 1.4906651108518089e-05, "loss": 0.2963, "step": 1394 }, { "epoch": 2.1968503937007875, "grad_norm": 0.22169458101240214, "learning_rate": 1.487747957992999e-05, "loss": 0.2918, "step": 1395 }, { "epoch": 2.1984251968503936, "grad_norm": 0.2084652960199894, "learning_rate": 1.484830805134189e-05, "loss": 0.2779, "step": 1396 }, { "epoch": 2.2, "grad_norm": 0.23400986114303154, "learning_rate": 1.4819136522753794e-05, "loss": 0.2806, "step": 1397 }, { "epoch": 2.2015748031496063, "grad_norm": 0.2359146546362693, "learning_rate": 1.4789964994165695e-05, "loss": 0.2774, "step": 1398 }, { "epoch": 2.2031496062992124, "grad_norm": 0.23315230359536931, "learning_rate": 1.4760793465577596e-05, "loss": 0.2763, "step": 1399 }, { "epoch": 2.204724409448819, "grad_norm": 0.2351399783096302, "learning_rate": 1.4731621936989498e-05, "loss": 0.2965, "step": 1400 }, { "epoch": 2.206299212598425, "grad_norm": 0.22365737178830594, "learning_rate": 1.47024504084014e-05, "loss": 0.2845, "step": 1401 }, { "epoch": 2.2078740157480317, "grad_norm": 0.22841685219674324, "learning_rate": 1.4673278879813304e-05, "loss": 0.2775, "step": 1402 }, { "epoch": 2.209448818897638, "grad_norm": 0.24548032870724185, "learning_rate": 1.4644107351225205e-05, "loss": 0.3018, "step": 1403 }, { "epoch": 2.211023622047244, "grad_norm": 0.21564890601073503, "learning_rate": 1.4614935822637108e-05, "loss": 0.2892, "step": 1404 }, { "epoch": 2.2125984251968505, "grad_norm": 0.23309310380884665, "learning_rate": 1.4585764294049009e-05, "loss": 0.2781, "step": 1405 }, { "epoch": 2.2141732283464566, "grad_norm": 0.23455583889915446, "learning_rate": 1.455659276546091e-05, "loss": 0.2902, "step": 1406 }, { "epoch": 2.215748031496063, "grad_norm": 0.2049467282303735, "learning_rate": 1.4527421236872813e-05, "loss": 0.2768, "step": 1407 }, { "epoch": 2.2173228346456693, "grad_norm": 0.22042204136808835, "learning_rate": 1.4498249708284714e-05, "loss": 0.2699, "step": 1408 }, { "epoch": 2.2188976377952754, "grad_norm": 0.2502099063608814, "learning_rate": 1.4469078179696615e-05, "loss": 0.2905, "step": 1409 }, { "epoch": 2.220472440944882, "grad_norm": 0.211467722088228, "learning_rate": 1.443990665110852e-05, "loss": 0.2687, "step": 1410 }, { "epoch": 2.222047244094488, "grad_norm": 0.23698845531523322, "learning_rate": 1.4410735122520422e-05, "loss": 0.2911, "step": 1411 }, { "epoch": 2.2236220472440946, "grad_norm": 0.22578663508874336, "learning_rate": 1.4381563593932323e-05, "loss": 0.278, "step": 1412 }, { "epoch": 2.2251968503937007, "grad_norm": 0.20880489789867324, "learning_rate": 1.4352392065344224e-05, "loss": 0.2803, "step": 1413 }, { "epoch": 2.226771653543307, "grad_norm": 0.21944842809904272, "learning_rate": 1.4323220536756127e-05, "loss": 0.2879, "step": 1414 }, { "epoch": 2.2283464566929134, "grad_norm": 0.25702557817192473, "learning_rate": 1.4294049008168028e-05, "loss": 0.2912, "step": 1415 }, { "epoch": 2.2299212598425195, "grad_norm": 0.21597500142385018, "learning_rate": 1.426487747957993e-05, "loss": 0.2847, "step": 1416 }, { "epoch": 2.231496062992126, "grad_norm": 0.24709859948687335, "learning_rate": 1.4235705950991832e-05, "loss": 0.2898, "step": 1417 }, { "epoch": 2.2330708661417322, "grad_norm": 0.20340729871368213, "learning_rate": 1.4206534422403733e-05, "loss": 0.2655, "step": 1418 }, { "epoch": 2.234645669291339, "grad_norm": 0.23429568774680629, "learning_rate": 1.4177362893815638e-05, "loss": 0.2806, "step": 1419 }, { "epoch": 2.236220472440945, "grad_norm": 0.21905303093648928, "learning_rate": 1.4148191365227539e-05, "loss": 0.2707, "step": 1420 }, { "epoch": 2.237795275590551, "grad_norm": 0.23113891304205073, "learning_rate": 1.4119019836639441e-05, "loss": 0.281, "step": 1421 }, { "epoch": 2.2393700787401576, "grad_norm": 0.2099793620452041, "learning_rate": 1.4089848308051343e-05, "loss": 0.2757, "step": 1422 }, { "epoch": 2.2409448818897637, "grad_norm": 0.22455055515096625, "learning_rate": 1.4060676779463244e-05, "loss": 0.2884, "step": 1423 }, { "epoch": 2.2425196850393703, "grad_norm": 0.22216112737063584, "learning_rate": 1.4031505250875146e-05, "loss": 0.2772, "step": 1424 }, { "epoch": 2.2440944881889764, "grad_norm": 0.22890795907884018, "learning_rate": 1.4002333722287048e-05, "loss": 0.287, "step": 1425 }, { "epoch": 2.2456692913385825, "grad_norm": 0.22883036310537475, "learning_rate": 1.3973162193698949e-05, "loss": 0.2763, "step": 1426 }, { "epoch": 2.247244094488189, "grad_norm": 0.2319656294207145, "learning_rate": 1.3943990665110853e-05, "loss": 0.277, "step": 1427 }, { "epoch": 2.248818897637795, "grad_norm": 0.21821696493151085, "learning_rate": 1.3914819136522756e-05, "loss": 0.2829, "step": 1428 }, { "epoch": 2.2503937007874018, "grad_norm": 0.2552943988138112, "learning_rate": 1.3885647607934657e-05, "loss": 0.2871, "step": 1429 }, { "epoch": 2.251968503937008, "grad_norm": 0.2235091776984639, "learning_rate": 1.3856476079346558e-05, "loss": 0.2793, "step": 1430 }, { "epoch": 2.253543307086614, "grad_norm": 0.1993036668354941, "learning_rate": 1.382730455075846e-05, "loss": 0.2803, "step": 1431 }, { "epoch": 2.2551181102362206, "grad_norm": 0.2359222305032181, "learning_rate": 1.3798133022170362e-05, "loss": 0.2905, "step": 1432 }, { "epoch": 2.2566929133858267, "grad_norm": 0.2729051154636401, "learning_rate": 1.3768961493582263e-05, "loss": 0.2987, "step": 1433 }, { "epoch": 2.2582677165354332, "grad_norm": 0.2179578554927583, "learning_rate": 1.3739789964994166e-05, "loss": 0.269, "step": 1434 }, { "epoch": 2.2598425196850394, "grad_norm": 0.23936849510184877, "learning_rate": 1.371061843640607e-05, "loss": 0.2797, "step": 1435 }, { "epoch": 2.261417322834646, "grad_norm": 0.2412608967170656, "learning_rate": 1.3681446907817971e-05, "loss": 0.2876, "step": 1436 }, { "epoch": 2.262992125984252, "grad_norm": 0.226499813050805, "learning_rate": 1.3652275379229872e-05, "loss": 0.3032, "step": 1437 }, { "epoch": 2.264566929133858, "grad_norm": 0.2199976583736491, "learning_rate": 1.3623103850641775e-05, "loss": 0.2809, "step": 1438 }, { "epoch": 2.2661417322834647, "grad_norm": 0.24026421130054815, "learning_rate": 1.3593932322053676e-05, "loss": 0.2741, "step": 1439 }, { "epoch": 2.267716535433071, "grad_norm": 0.22890754480508474, "learning_rate": 1.3564760793465577e-05, "loss": 0.2768, "step": 1440 }, { "epoch": 2.2692913385826774, "grad_norm": 0.2608325571405233, "learning_rate": 1.353558926487748e-05, "loss": 0.2892, "step": 1441 }, { "epoch": 2.2708661417322835, "grad_norm": 0.23235410160155848, "learning_rate": 1.3506417736289381e-05, "loss": 0.2911, "step": 1442 }, { "epoch": 2.2724409448818896, "grad_norm": 0.202658696387314, "learning_rate": 1.3477246207701282e-05, "loss": 0.2596, "step": 1443 }, { "epoch": 2.274015748031496, "grad_norm": 0.25964541414291453, "learning_rate": 1.3448074679113187e-05, "loss": 0.2873, "step": 1444 }, { "epoch": 2.2755905511811023, "grad_norm": 0.24789905318742037, "learning_rate": 1.341890315052509e-05, "loss": 0.2691, "step": 1445 }, { "epoch": 2.277165354330709, "grad_norm": 0.23652484951948843, "learning_rate": 1.338973162193699e-05, "loss": 0.2894, "step": 1446 }, { "epoch": 2.278740157480315, "grad_norm": 0.23973627803312308, "learning_rate": 1.3360560093348892e-05, "loss": 0.2858, "step": 1447 }, { "epoch": 2.280314960629921, "grad_norm": 0.25949837160384215, "learning_rate": 1.3331388564760794e-05, "loss": 0.3073, "step": 1448 }, { "epoch": 2.2818897637795277, "grad_norm": 0.22427301165764235, "learning_rate": 1.3302217036172695e-05, "loss": 0.2696, "step": 1449 }, { "epoch": 2.283464566929134, "grad_norm": 0.21396123160817415, "learning_rate": 1.3273045507584597e-05, "loss": 0.2837, "step": 1450 }, { "epoch": 2.2850393700787404, "grad_norm": 0.22496174832515922, "learning_rate": 1.32438739789965e-05, "loss": 0.2896, "step": 1451 }, { "epoch": 2.2866141732283465, "grad_norm": 0.22696077864978292, "learning_rate": 1.3214702450408404e-05, "loss": 0.2744, "step": 1452 }, { "epoch": 2.2881889763779526, "grad_norm": 0.21942757714324265, "learning_rate": 1.3185530921820305e-05, "loss": 0.2744, "step": 1453 }, { "epoch": 2.289763779527559, "grad_norm": 0.24543293776771777, "learning_rate": 1.3156359393232206e-05, "loss": 0.2859, "step": 1454 }, { "epoch": 2.2913385826771653, "grad_norm": 0.21006253185071247, "learning_rate": 1.3127187864644109e-05, "loss": 0.2792, "step": 1455 }, { "epoch": 2.292913385826772, "grad_norm": 0.23218821171413773, "learning_rate": 1.309801633605601e-05, "loss": 0.2792, "step": 1456 }, { "epoch": 2.294488188976378, "grad_norm": 0.23288877250268153, "learning_rate": 1.3068844807467911e-05, "loss": 0.2925, "step": 1457 }, { "epoch": 2.296062992125984, "grad_norm": 0.22499908227522314, "learning_rate": 1.3039673278879814e-05, "loss": 0.2699, "step": 1458 }, { "epoch": 2.2976377952755906, "grad_norm": 0.23230130367468854, "learning_rate": 1.3010501750291715e-05, "loss": 0.2669, "step": 1459 }, { "epoch": 2.2992125984251968, "grad_norm": 0.23444693531616803, "learning_rate": 1.298133022170362e-05, "loss": 0.2827, "step": 1460 }, { "epoch": 2.3007874015748033, "grad_norm": 0.2356316683779814, "learning_rate": 1.295215869311552e-05, "loss": 0.2868, "step": 1461 }, { "epoch": 2.3023622047244094, "grad_norm": 0.23824202516937876, "learning_rate": 1.2922987164527423e-05, "loss": 0.2894, "step": 1462 }, { "epoch": 2.3039370078740156, "grad_norm": 0.2403836767599592, "learning_rate": 1.2893815635939324e-05, "loss": 0.2927, "step": 1463 }, { "epoch": 2.305511811023622, "grad_norm": 0.23584475677528402, "learning_rate": 1.2864644107351225e-05, "loss": 0.2993, "step": 1464 }, { "epoch": 2.3070866141732282, "grad_norm": 0.23135672907876287, "learning_rate": 1.2835472578763128e-05, "loss": 0.2905, "step": 1465 }, { "epoch": 2.308661417322835, "grad_norm": 0.23015072934549755, "learning_rate": 1.2806301050175029e-05, "loss": 0.2721, "step": 1466 }, { "epoch": 2.310236220472441, "grad_norm": 0.2480610402143088, "learning_rate": 1.277712952158693e-05, "loss": 0.2797, "step": 1467 }, { "epoch": 2.311811023622047, "grad_norm": 0.22711118629636298, "learning_rate": 1.2747957992998833e-05, "loss": 0.2646, "step": 1468 }, { "epoch": 2.3133858267716536, "grad_norm": 0.24780842213285867, "learning_rate": 1.2718786464410737e-05, "loss": 0.282, "step": 1469 }, { "epoch": 2.3149606299212597, "grad_norm": 0.21153158161732674, "learning_rate": 1.2689614935822638e-05, "loss": 0.2795, "step": 1470 }, { "epoch": 2.3165354330708663, "grad_norm": 0.24563733158100262, "learning_rate": 1.266044340723454e-05, "loss": 0.2776, "step": 1471 }, { "epoch": 2.3181102362204724, "grad_norm": 0.21651909369642927, "learning_rate": 1.2631271878646442e-05, "loss": 0.2773, "step": 1472 }, { "epoch": 2.3196850393700785, "grad_norm": 0.23132674679630488, "learning_rate": 1.2602100350058343e-05, "loss": 0.2761, "step": 1473 }, { "epoch": 2.321259842519685, "grad_norm": 0.2350632512398343, "learning_rate": 1.2572928821470245e-05, "loss": 0.2973, "step": 1474 }, { "epoch": 2.322834645669291, "grad_norm": 0.23907706082723623, "learning_rate": 1.2543757292882147e-05, "loss": 0.2741, "step": 1475 }, { "epoch": 2.3244094488188978, "grad_norm": 0.22415492629302994, "learning_rate": 1.2514585764294048e-05, "loss": 0.286, "step": 1476 }, { "epoch": 2.325984251968504, "grad_norm": 0.22820272633596786, "learning_rate": 1.2485414235705951e-05, "loss": 0.2789, "step": 1477 }, { "epoch": 2.32755905511811, "grad_norm": 0.23584855135917457, "learning_rate": 1.2456242707117854e-05, "loss": 0.2876, "step": 1478 }, { "epoch": 2.3291338582677166, "grad_norm": 0.2136574687001148, "learning_rate": 1.2427071178529757e-05, "loss": 0.2858, "step": 1479 }, { "epoch": 2.3307086614173227, "grad_norm": 0.19982376514401443, "learning_rate": 1.2397899649941658e-05, "loss": 0.2697, "step": 1480 }, { "epoch": 2.3322834645669293, "grad_norm": 0.23896788687398224, "learning_rate": 1.2368728121353559e-05, "loss": 0.2843, "step": 1481 }, { "epoch": 2.3338582677165354, "grad_norm": 0.22747228194602734, "learning_rate": 1.2339556592765462e-05, "loss": 0.291, "step": 1482 }, { "epoch": 2.3354330708661415, "grad_norm": 0.22685121784824175, "learning_rate": 1.2310385064177364e-05, "loss": 0.2707, "step": 1483 }, { "epoch": 2.337007874015748, "grad_norm": 0.2230208494698019, "learning_rate": 1.2281213535589265e-05, "loss": 0.2793, "step": 1484 }, { "epoch": 2.338582677165354, "grad_norm": 0.2388996248403338, "learning_rate": 1.2252042007001167e-05, "loss": 0.2683, "step": 1485 }, { "epoch": 2.3401574803149607, "grad_norm": 0.22705055469460048, "learning_rate": 1.222287047841307e-05, "loss": 0.2785, "step": 1486 }, { "epoch": 2.341732283464567, "grad_norm": 0.2274162064917656, "learning_rate": 1.2193698949824972e-05, "loss": 0.2738, "step": 1487 }, { "epoch": 2.3433070866141734, "grad_norm": 0.22920156734647806, "learning_rate": 1.2164527421236873e-05, "loss": 0.2738, "step": 1488 }, { "epoch": 2.3448818897637795, "grad_norm": 0.2286138374517549, "learning_rate": 1.2135355892648776e-05, "loss": 0.2935, "step": 1489 }, { "epoch": 2.3464566929133857, "grad_norm": 0.20774984759005224, "learning_rate": 1.2106184364060677e-05, "loss": 0.2764, "step": 1490 }, { "epoch": 2.348031496062992, "grad_norm": 0.23839924662863565, "learning_rate": 1.2077012835472578e-05, "loss": 0.3008, "step": 1491 }, { "epoch": 2.3496062992125983, "grad_norm": 0.2402139528221408, "learning_rate": 1.2047841306884481e-05, "loss": 0.2854, "step": 1492 }, { "epoch": 2.351181102362205, "grad_norm": 0.2440316683522424, "learning_rate": 1.2018669778296384e-05, "loss": 0.2796, "step": 1493 }, { "epoch": 2.352755905511811, "grad_norm": 0.20868375661138377, "learning_rate": 1.1989498249708285e-05, "loss": 0.2821, "step": 1494 }, { "epoch": 2.354330708661417, "grad_norm": 0.20557383633620582, "learning_rate": 1.1960326721120186e-05, "loss": 0.266, "step": 1495 }, { "epoch": 2.3559055118110237, "grad_norm": 0.242255306179062, "learning_rate": 1.193115519253209e-05, "loss": 0.2831, "step": 1496 }, { "epoch": 2.35748031496063, "grad_norm": 0.21949714490513378, "learning_rate": 1.1901983663943991e-05, "loss": 0.2663, "step": 1497 }, { "epoch": 2.3590551181102364, "grad_norm": 0.21319976325830786, "learning_rate": 1.1872812135355892e-05, "loss": 0.2669, "step": 1498 }, { "epoch": 2.3606299212598425, "grad_norm": 0.22991739644915404, "learning_rate": 1.1843640606767795e-05, "loss": 0.2864, "step": 1499 }, { "epoch": 2.362204724409449, "grad_norm": 0.25101421181292954, "learning_rate": 1.1814469078179698e-05, "loss": 0.2816, "step": 1500 }, { "epoch": 2.363779527559055, "grad_norm": 0.23705768912379516, "learning_rate": 1.1785297549591599e-05, "loss": 0.2781, "step": 1501 }, { "epoch": 2.3653543307086613, "grad_norm": 0.2226577821414264, "learning_rate": 1.17561260210035e-05, "loss": 0.2751, "step": 1502 }, { "epoch": 2.366929133858268, "grad_norm": 0.2431090868617522, "learning_rate": 1.1726954492415403e-05, "loss": 0.2928, "step": 1503 }, { "epoch": 2.368503937007874, "grad_norm": 0.21770364909969447, "learning_rate": 1.1697782963827306e-05, "loss": 0.2765, "step": 1504 }, { "epoch": 2.3700787401574805, "grad_norm": 0.21678495566821132, "learning_rate": 1.1668611435239207e-05, "loss": 0.2718, "step": 1505 }, { "epoch": 2.3716535433070867, "grad_norm": 0.23717746638867285, "learning_rate": 1.163943990665111e-05, "loss": 0.2775, "step": 1506 }, { "epoch": 2.373228346456693, "grad_norm": 0.22348402929681743, "learning_rate": 1.161026837806301e-05, "loss": 0.2824, "step": 1507 }, { "epoch": 2.3748031496062993, "grad_norm": 0.2198468285896082, "learning_rate": 1.1581096849474913e-05, "loss": 0.274, "step": 1508 }, { "epoch": 2.3763779527559055, "grad_norm": 0.22328171225445959, "learning_rate": 1.1551925320886815e-05, "loss": 0.2787, "step": 1509 }, { "epoch": 2.377952755905512, "grad_norm": 0.21354981977009435, "learning_rate": 1.1522753792298717e-05, "loss": 0.2777, "step": 1510 }, { "epoch": 2.379527559055118, "grad_norm": 0.22045299439391677, "learning_rate": 1.1493582263710618e-05, "loss": 0.2733, "step": 1511 }, { "epoch": 2.3811023622047243, "grad_norm": 0.21432108217710003, "learning_rate": 1.1464410735122521e-05, "loss": 0.2943, "step": 1512 }, { "epoch": 2.382677165354331, "grad_norm": 0.22789649691484323, "learning_rate": 1.1435239206534424e-05, "loss": 0.2797, "step": 1513 }, { "epoch": 2.384251968503937, "grad_norm": 0.22109501014467292, "learning_rate": 1.1406067677946325e-05, "loss": 0.2773, "step": 1514 }, { "epoch": 2.3858267716535435, "grad_norm": 0.23639477603426273, "learning_rate": 1.1376896149358226e-05, "loss": 0.2819, "step": 1515 }, { "epoch": 2.3874015748031496, "grad_norm": 0.21232181108772216, "learning_rate": 1.1347724620770129e-05, "loss": 0.2748, "step": 1516 }, { "epoch": 2.3889763779527557, "grad_norm": 0.2143371342893271, "learning_rate": 1.1318553092182032e-05, "loss": 0.2823, "step": 1517 }, { "epoch": 2.3905511811023623, "grad_norm": 0.2188004380425287, "learning_rate": 1.1289381563593933e-05, "loss": 0.2921, "step": 1518 }, { "epoch": 2.3921259842519684, "grad_norm": 0.2042107205061148, "learning_rate": 1.1260210035005834e-05, "loss": 0.2809, "step": 1519 }, { "epoch": 2.393700787401575, "grad_norm": 0.21127254861061864, "learning_rate": 1.1231038506417737e-05, "loss": 0.2729, "step": 1520 }, { "epoch": 2.395275590551181, "grad_norm": 0.2122687761473091, "learning_rate": 1.120186697782964e-05, "loss": 0.2741, "step": 1521 }, { "epoch": 2.3968503937007872, "grad_norm": 0.20665000726291663, "learning_rate": 1.117269544924154e-05, "loss": 0.2752, "step": 1522 }, { "epoch": 2.398425196850394, "grad_norm": 0.21129989766101856, "learning_rate": 1.1143523920653443e-05, "loss": 0.2581, "step": 1523 }, { "epoch": 2.4, "grad_norm": 0.2171085373656638, "learning_rate": 1.1114352392065344e-05, "loss": 0.2805, "step": 1524 }, { "epoch": 2.4015748031496065, "grad_norm": 0.21791732544784653, "learning_rate": 1.1085180863477247e-05, "loss": 0.2891, "step": 1525 }, { "epoch": 2.4031496062992126, "grad_norm": 0.2215694958882201, "learning_rate": 1.1056009334889148e-05, "loss": 0.2805, "step": 1526 }, { "epoch": 2.4047244094488187, "grad_norm": 0.2269596943048426, "learning_rate": 1.1026837806301051e-05, "loss": 0.29, "step": 1527 }, { "epoch": 2.4062992125984253, "grad_norm": 0.23722965063051302, "learning_rate": 1.0997666277712952e-05, "loss": 0.2754, "step": 1528 }, { "epoch": 2.4078740157480314, "grad_norm": 0.2508999858990772, "learning_rate": 1.0968494749124855e-05, "loss": 0.2783, "step": 1529 }, { "epoch": 2.409448818897638, "grad_norm": 0.20771076086807894, "learning_rate": 1.0939323220536758e-05, "loss": 0.2688, "step": 1530 }, { "epoch": 2.411023622047244, "grad_norm": 0.21976725364658659, "learning_rate": 1.0910151691948659e-05, "loss": 0.2828, "step": 1531 }, { "epoch": 2.41259842519685, "grad_norm": 0.24663360181075955, "learning_rate": 1.088098016336056e-05, "loss": 0.293, "step": 1532 }, { "epoch": 2.4141732283464568, "grad_norm": 0.2352876990096626, "learning_rate": 1.0851808634772462e-05, "loss": 0.2612, "step": 1533 }, { "epoch": 2.415748031496063, "grad_norm": 0.25317330602993626, "learning_rate": 1.0822637106184365e-05, "loss": 0.2877, "step": 1534 }, { "epoch": 2.4173228346456694, "grad_norm": 0.24664076848527355, "learning_rate": 1.0793465577596266e-05, "loss": 0.2826, "step": 1535 }, { "epoch": 2.4188976377952756, "grad_norm": 0.2343219614595752, "learning_rate": 1.0764294049008167e-05, "loss": 0.2681, "step": 1536 }, { "epoch": 2.4204724409448817, "grad_norm": 0.24747077573413845, "learning_rate": 1.0735122520420072e-05, "loss": 0.2844, "step": 1537 }, { "epoch": 2.4220472440944882, "grad_norm": 0.2242193008158641, "learning_rate": 1.0705950991831973e-05, "loss": 0.2706, "step": 1538 }, { "epoch": 2.4236220472440944, "grad_norm": 0.25594864846267806, "learning_rate": 1.0676779463243874e-05, "loss": 0.2738, "step": 1539 }, { "epoch": 2.425196850393701, "grad_norm": 0.2581894149550932, "learning_rate": 1.0647607934655777e-05, "loss": 0.2693, "step": 1540 }, { "epoch": 2.426771653543307, "grad_norm": 0.24012269030792346, "learning_rate": 1.061843640606768e-05, "loss": 0.2764, "step": 1541 }, { "epoch": 2.428346456692913, "grad_norm": 0.21415227679458, "learning_rate": 1.058926487747958e-05, "loss": 0.2934, "step": 1542 }, { "epoch": 2.4299212598425197, "grad_norm": 0.21650512053546528, "learning_rate": 1.0560093348891482e-05, "loss": 0.2765, "step": 1543 }, { "epoch": 2.431496062992126, "grad_norm": 0.23760245151421075, "learning_rate": 1.0530921820303385e-05, "loss": 0.2851, "step": 1544 }, { "epoch": 2.4330708661417324, "grad_norm": 0.23411773364431143, "learning_rate": 1.0501750291715286e-05, "loss": 0.2685, "step": 1545 }, { "epoch": 2.4346456692913385, "grad_norm": 0.27041939036200335, "learning_rate": 1.0472578763127188e-05, "loss": 0.2751, "step": 1546 }, { "epoch": 2.4362204724409446, "grad_norm": 0.24019614257396077, "learning_rate": 1.0443407234539091e-05, "loss": 0.2907, "step": 1547 }, { "epoch": 2.437795275590551, "grad_norm": 0.2261132094138344, "learning_rate": 1.0414235705950992e-05, "loss": 0.2781, "step": 1548 }, { "epoch": 2.4393700787401573, "grad_norm": 0.2209320915721896, "learning_rate": 1.0385064177362893e-05, "loss": 0.2736, "step": 1549 }, { "epoch": 2.440944881889764, "grad_norm": 0.2151656286369365, "learning_rate": 1.0355892648774796e-05, "loss": 0.2846, "step": 1550 }, { "epoch": 2.44251968503937, "grad_norm": 0.22594174293026084, "learning_rate": 1.0326721120186699e-05, "loss": 0.2841, "step": 1551 }, { "epoch": 2.4440944881889766, "grad_norm": 0.25141057010642087, "learning_rate": 1.02975495915986e-05, "loss": 0.2782, "step": 1552 }, { "epoch": 2.4456692913385827, "grad_norm": 0.21335172090106505, "learning_rate": 1.0268378063010501e-05, "loss": 0.2712, "step": 1553 }, { "epoch": 2.447244094488189, "grad_norm": 0.22178831728875492, "learning_rate": 1.0239206534422406e-05, "loss": 0.2903, "step": 1554 }, { "epoch": 2.4488188976377954, "grad_norm": 0.22419957644611535, "learning_rate": 1.0210035005834307e-05, "loss": 0.2897, "step": 1555 }, { "epoch": 2.4503937007874015, "grad_norm": 0.23141135337182286, "learning_rate": 1.0180863477246208e-05, "loss": 0.2728, "step": 1556 }, { "epoch": 2.451968503937008, "grad_norm": 0.23390047568499003, "learning_rate": 1.015169194865811e-05, "loss": 0.2962, "step": 1557 }, { "epoch": 2.453543307086614, "grad_norm": 0.21003554226923246, "learning_rate": 1.0122520420070013e-05, "loss": 0.2725, "step": 1558 }, { "epoch": 2.4551181102362203, "grad_norm": 0.21612342940729654, "learning_rate": 1.0093348891481914e-05, "loss": 0.2851, "step": 1559 }, { "epoch": 2.456692913385827, "grad_norm": 0.21579805426897644, "learning_rate": 1.0064177362893815e-05, "loss": 0.2677, "step": 1560 }, { "epoch": 2.458267716535433, "grad_norm": 0.23142865481856237, "learning_rate": 1.0035005834305718e-05, "loss": 0.2904, "step": 1561 }, { "epoch": 2.4598425196850395, "grad_norm": 0.22683025077436034, "learning_rate": 1.0005834305717621e-05, "loss": 0.2683, "step": 1562 }, { "epoch": 2.4614173228346456, "grad_norm": 0.23219050407345931, "learning_rate": 9.976662777129522e-06, "loss": 0.2898, "step": 1563 }, { "epoch": 2.462992125984252, "grad_norm": 0.23345863383496632, "learning_rate": 9.947491248541425e-06, "loss": 0.2821, "step": 1564 }, { "epoch": 2.4645669291338583, "grad_norm": 0.2313719262994064, "learning_rate": 9.918319719953326e-06, "loss": 0.2842, "step": 1565 }, { "epoch": 2.4661417322834644, "grad_norm": 0.23295077041748086, "learning_rate": 9.889148191365229e-06, "loss": 0.2644, "step": 1566 }, { "epoch": 2.467716535433071, "grad_norm": 0.24008399414032688, "learning_rate": 9.85997666277713e-06, "loss": 0.273, "step": 1567 }, { "epoch": 2.469291338582677, "grad_norm": 0.2212493285287193, "learning_rate": 9.830805134189033e-06, "loss": 0.2637, "step": 1568 }, { "epoch": 2.4708661417322837, "grad_norm": 0.22366809978258886, "learning_rate": 9.801633605600934e-06, "loss": 0.2762, "step": 1569 }, { "epoch": 2.47244094488189, "grad_norm": 0.2117842425421386, "learning_rate": 9.772462077012835e-06, "loss": 0.2662, "step": 1570 }, { "epoch": 2.474015748031496, "grad_norm": 0.21503140772998636, "learning_rate": 9.743290548424739e-06, "loss": 0.2726, "step": 1571 }, { "epoch": 2.4755905511811025, "grad_norm": 0.24696168727993628, "learning_rate": 9.71411901983664e-06, "loss": 0.2802, "step": 1572 }, { "epoch": 2.4771653543307086, "grad_norm": 0.2146792655199464, "learning_rate": 9.684947491248541e-06, "loss": 0.2827, "step": 1573 }, { "epoch": 2.478740157480315, "grad_norm": 0.2389986980129217, "learning_rate": 9.655775962660444e-06, "loss": 0.276, "step": 1574 }, { "epoch": 2.4803149606299213, "grad_norm": 0.21603411746296194, "learning_rate": 9.626604434072347e-06, "loss": 0.2765, "step": 1575 }, { "epoch": 2.4818897637795274, "grad_norm": 0.2014675773585549, "learning_rate": 9.597432905484248e-06, "loss": 0.2653, "step": 1576 }, { "epoch": 2.483464566929134, "grad_norm": 0.20360452731642248, "learning_rate": 9.568261376896149e-06, "loss": 0.2694, "step": 1577 }, { "epoch": 2.48503937007874, "grad_norm": 0.219746783914528, "learning_rate": 9.539089848308052e-06, "loss": 0.2679, "step": 1578 }, { "epoch": 2.4866141732283467, "grad_norm": 0.25098859749231456, "learning_rate": 9.509918319719955e-06, "loss": 0.2832, "step": 1579 }, { "epoch": 2.4881889763779528, "grad_norm": 0.20653083376483433, "learning_rate": 9.480746791131856e-06, "loss": 0.286, "step": 1580 }, { "epoch": 2.489763779527559, "grad_norm": 0.23494835251090038, "learning_rate": 9.451575262543758e-06, "loss": 0.2989, "step": 1581 }, { "epoch": 2.4913385826771655, "grad_norm": 0.2151851887988243, "learning_rate": 9.42240373395566e-06, "loss": 0.2859, "step": 1582 }, { "epoch": 2.4929133858267716, "grad_norm": 0.20959660021964888, "learning_rate": 9.393232205367562e-06, "loss": 0.2682, "step": 1583 }, { "epoch": 2.494488188976378, "grad_norm": 0.22083698125931614, "learning_rate": 9.364060676779463e-06, "loss": 0.2815, "step": 1584 }, { "epoch": 2.4960629921259843, "grad_norm": 0.2060733077396822, "learning_rate": 9.334889148191366e-06, "loss": 0.2767, "step": 1585 }, { "epoch": 2.4976377952755904, "grad_norm": 0.2191810531141266, "learning_rate": 9.305717619603267e-06, "loss": 0.2821, "step": 1586 }, { "epoch": 2.499212598425197, "grad_norm": 0.24104574209992258, "learning_rate": 9.27654609101517e-06, "loss": 0.2792, "step": 1587 }, { "epoch": 2.500787401574803, "grad_norm": 0.20589157910814646, "learning_rate": 9.247374562427071e-06, "loss": 0.2891, "step": 1588 }, { "epoch": 2.5023622047244096, "grad_norm": 0.22542796894357212, "learning_rate": 9.218203033838974e-06, "loss": 0.2836, "step": 1589 }, { "epoch": 2.5039370078740157, "grad_norm": 0.2563217614657794, "learning_rate": 9.189031505250875e-06, "loss": 0.2789, "step": 1590 }, { "epoch": 2.505511811023622, "grad_norm": 0.25615389339132, "learning_rate": 9.159859976662778e-06, "loss": 0.2816, "step": 1591 }, { "epoch": 2.5070866141732284, "grad_norm": 0.2332835378938043, "learning_rate": 9.13068844807468e-06, "loss": 0.2831, "step": 1592 }, { "epoch": 2.5086614173228345, "grad_norm": 0.21646491398183093, "learning_rate": 9.101516919486582e-06, "loss": 0.27, "step": 1593 }, { "epoch": 2.510236220472441, "grad_norm": 0.22750013988673193, "learning_rate": 9.072345390898483e-06, "loss": 0.2854, "step": 1594 }, { "epoch": 2.5118110236220472, "grad_norm": 0.2135283665329251, "learning_rate": 9.043173862310385e-06, "loss": 0.272, "step": 1595 }, { "epoch": 2.5133858267716533, "grad_norm": 0.2271494001556057, "learning_rate": 9.014002333722288e-06, "loss": 0.2875, "step": 1596 }, { "epoch": 2.51496062992126, "grad_norm": 0.22581689068890634, "learning_rate": 8.98483080513419e-06, "loss": 0.276, "step": 1597 }, { "epoch": 2.516535433070866, "grad_norm": 0.21202699733241173, "learning_rate": 8.95565927654609e-06, "loss": 0.2767, "step": 1598 }, { "epoch": 2.5181102362204726, "grad_norm": 0.21931258685869834, "learning_rate": 8.926487747957993e-06, "loss": 0.2719, "step": 1599 }, { "epoch": 2.5196850393700787, "grad_norm": 0.22841892387469004, "learning_rate": 8.897316219369896e-06, "loss": 0.2774, "step": 1600 }, { "epoch": 2.521259842519685, "grad_norm": 0.21218663010662256, "learning_rate": 8.868144690781797e-06, "loss": 0.2701, "step": 1601 }, { "epoch": 2.5228346456692914, "grad_norm": 0.22680808879380912, "learning_rate": 8.8389731621937e-06, "loss": 0.2934, "step": 1602 }, { "epoch": 2.5244094488188975, "grad_norm": 0.24251255725808457, "learning_rate": 8.8098016336056e-06, "loss": 0.2781, "step": 1603 }, { "epoch": 2.525984251968504, "grad_norm": 0.22921710128147504, "learning_rate": 8.780630105017504e-06, "loss": 0.2778, "step": 1604 }, { "epoch": 2.52755905511811, "grad_norm": 0.21560031994059373, "learning_rate": 8.751458576429405e-06, "loss": 0.2981, "step": 1605 }, { "epoch": 2.5291338582677163, "grad_norm": 0.20343882247504602, "learning_rate": 8.722287047841307e-06, "loss": 0.2793, "step": 1606 }, { "epoch": 2.530708661417323, "grad_norm": 0.20857262962631817, "learning_rate": 8.693115519253209e-06, "loss": 0.2843, "step": 1607 }, { "epoch": 2.532283464566929, "grad_norm": 0.22778736153812304, "learning_rate": 8.663943990665111e-06, "loss": 0.2746, "step": 1608 }, { "epoch": 2.5338582677165356, "grad_norm": 0.21946021663921056, "learning_rate": 8.634772462077014e-06, "loss": 0.2901, "step": 1609 }, { "epoch": 2.5354330708661417, "grad_norm": 0.2189846694117503, "learning_rate": 8.605600933488915e-06, "loss": 0.2871, "step": 1610 }, { "epoch": 2.537007874015748, "grad_norm": 0.25116574321110197, "learning_rate": 8.576429404900816e-06, "loss": 0.2943, "step": 1611 }, { "epoch": 2.5385826771653544, "grad_norm": 0.20949365737383666, "learning_rate": 8.547257876312719e-06, "loss": 0.2844, "step": 1612 }, { "epoch": 2.5401574803149605, "grad_norm": 0.2295195090881187, "learning_rate": 8.518086347724622e-06, "loss": 0.2708, "step": 1613 }, { "epoch": 2.541732283464567, "grad_norm": 0.2325136132252242, "learning_rate": 8.488914819136523e-06, "loss": 0.2936, "step": 1614 }, { "epoch": 2.543307086614173, "grad_norm": 0.19744218073866762, "learning_rate": 8.459743290548424e-06, "loss": 0.2896, "step": 1615 }, { "epoch": 2.5448818897637793, "grad_norm": 0.2118465596658989, "learning_rate": 8.430571761960328e-06, "loss": 0.274, "step": 1616 }, { "epoch": 2.546456692913386, "grad_norm": 0.20927617550289612, "learning_rate": 8.40140023337223e-06, "loss": 0.2849, "step": 1617 }, { "epoch": 2.5480314960629924, "grad_norm": 0.23157845023389328, "learning_rate": 8.37222870478413e-06, "loss": 0.2832, "step": 1618 }, { "epoch": 2.5496062992125985, "grad_norm": 0.20996749809151966, "learning_rate": 8.343057176196033e-06, "loss": 0.2763, "step": 1619 }, { "epoch": 2.5511811023622046, "grad_norm": 0.20096857831912, "learning_rate": 8.313885647607934e-06, "loss": 0.2702, "step": 1620 }, { "epoch": 2.5527559055118108, "grad_norm": 0.22966972955343162, "learning_rate": 8.284714119019837e-06, "loss": 0.2802, "step": 1621 }, { "epoch": 2.5543307086614173, "grad_norm": 0.2114136868467788, "learning_rate": 8.255542590431738e-06, "loss": 0.2725, "step": 1622 }, { "epoch": 2.555905511811024, "grad_norm": 0.20204007603556268, "learning_rate": 8.226371061843641e-06, "loss": 0.2815, "step": 1623 }, { "epoch": 2.55748031496063, "grad_norm": 0.20367219710056866, "learning_rate": 8.197199533255542e-06, "loss": 0.2769, "step": 1624 }, { "epoch": 2.559055118110236, "grad_norm": 0.21235216343150856, "learning_rate": 8.168028004667445e-06, "loss": 0.2769, "step": 1625 }, { "epoch": 2.5606299212598427, "grad_norm": 0.23136619837973296, "learning_rate": 8.138856476079348e-06, "loss": 0.2891, "step": 1626 }, { "epoch": 2.562204724409449, "grad_norm": 0.2018467310751204, "learning_rate": 8.109684947491249e-06, "loss": 0.2737, "step": 1627 }, { "epoch": 2.5637795275590554, "grad_norm": 0.23307849213617168, "learning_rate": 8.08051341890315e-06, "loss": 0.2973, "step": 1628 }, { "epoch": 2.5653543307086615, "grad_norm": 0.2146537486220867, "learning_rate": 8.051341890315053e-06, "loss": 0.276, "step": 1629 }, { "epoch": 2.5669291338582676, "grad_norm": 0.204181644579748, "learning_rate": 8.022170361726955e-06, "loss": 0.2808, "step": 1630 }, { "epoch": 2.568503937007874, "grad_norm": 0.24200639155128334, "learning_rate": 7.992998833138857e-06, "loss": 0.297, "step": 1631 }, { "epoch": 2.5700787401574803, "grad_norm": 0.22263508568312063, "learning_rate": 7.963827304550758e-06, "loss": 0.2726, "step": 1632 }, { "epoch": 2.571653543307087, "grad_norm": 0.21698011138075388, "learning_rate": 7.934655775962662e-06, "loss": 0.2766, "step": 1633 }, { "epoch": 2.573228346456693, "grad_norm": 0.20819366600772862, "learning_rate": 7.905484247374563e-06, "loss": 0.2895, "step": 1634 }, { "epoch": 2.574803149606299, "grad_norm": 0.22297271410400557, "learning_rate": 7.876312718786464e-06, "loss": 0.2862, "step": 1635 }, { "epoch": 2.5763779527559056, "grad_norm": 0.21426004192590528, "learning_rate": 7.847141190198367e-06, "loss": 0.2706, "step": 1636 }, { "epoch": 2.5779527559055118, "grad_norm": 0.21229238144122367, "learning_rate": 7.81796966161027e-06, "loss": 0.2888, "step": 1637 }, { "epoch": 2.5795275590551183, "grad_norm": 0.21403170650905695, "learning_rate": 7.788798133022171e-06, "loss": 0.2698, "step": 1638 }, { "epoch": 2.5811023622047244, "grad_norm": 0.20900130687194937, "learning_rate": 7.759626604434072e-06, "loss": 0.2741, "step": 1639 }, { "epoch": 2.5826771653543306, "grad_norm": 0.22055239327798076, "learning_rate": 7.730455075845975e-06, "loss": 0.2782, "step": 1640 }, { "epoch": 2.584251968503937, "grad_norm": 0.21444724932167594, "learning_rate": 7.701283547257877e-06, "loss": 0.2891, "step": 1641 }, { "epoch": 2.5858267716535432, "grad_norm": 0.20626643452167062, "learning_rate": 7.672112018669779e-06, "loss": 0.2815, "step": 1642 }, { "epoch": 2.58740157480315, "grad_norm": 0.22590023389779978, "learning_rate": 7.642940490081681e-06, "loss": 0.273, "step": 1643 }, { "epoch": 2.588976377952756, "grad_norm": 0.22537883130665526, "learning_rate": 7.613768961493582e-06, "loss": 0.2682, "step": 1644 }, { "epoch": 2.590551181102362, "grad_norm": 0.2262151595615975, "learning_rate": 7.584597432905485e-06, "loss": 0.2781, "step": 1645 }, { "epoch": 2.5921259842519686, "grad_norm": 0.19362958689845572, "learning_rate": 7.555425904317387e-06, "loss": 0.2738, "step": 1646 }, { "epoch": 2.5937007874015747, "grad_norm": 0.21672995597860417, "learning_rate": 7.526254375729288e-06, "loss": 0.2586, "step": 1647 }, { "epoch": 2.5952755905511813, "grad_norm": 0.21537887276453865, "learning_rate": 7.49708284714119e-06, "loss": 0.299, "step": 1648 }, { "epoch": 2.5968503937007874, "grad_norm": 0.22890117080757724, "learning_rate": 7.467911318553092e-06, "loss": 0.2818, "step": 1649 }, { "epoch": 2.5984251968503935, "grad_norm": 0.2294575082860338, "learning_rate": 7.438739789964995e-06, "loss": 0.2956, "step": 1650 }, { "epoch": 2.6, "grad_norm": 0.2254424624585562, "learning_rate": 7.409568261376897e-06, "loss": 0.2833, "step": 1651 }, { "epoch": 2.601574803149606, "grad_norm": 0.21350637591445512, "learning_rate": 7.380396732788798e-06, "loss": 0.2735, "step": 1652 }, { "epoch": 2.6031496062992128, "grad_norm": 0.20022580491699735, "learning_rate": 7.3512252042007e-06, "loss": 0.2631, "step": 1653 }, { "epoch": 2.604724409448819, "grad_norm": 0.2145679980699218, "learning_rate": 7.3220536756126025e-06, "loss": 0.2794, "step": 1654 }, { "epoch": 2.606299212598425, "grad_norm": 0.20660615618352554, "learning_rate": 7.2928821470245045e-06, "loss": 0.2791, "step": 1655 }, { "epoch": 2.6078740157480316, "grad_norm": 0.211752695549606, "learning_rate": 7.263710618436406e-06, "loss": 0.2911, "step": 1656 }, { "epoch": 2.6094488188976377, "grad_norm": 0.2090705051809447, "learning_rate": 7.2345390898483075e-06, "loss": 0.2721, "step": 1657 }, { "epoch": 2.6110236220472443, "grad_norm": 0.3099473294901942, "learning_rate": 7.205367561260211e-06, "loss": 0.3309, "step": 1658 }, { "epoch": 2.6125984251968504, "grad_norm": 0.21671025695236715, "learning_rate": 7.176196032672112e-06, "loss": 0.2806, "step": 1659 }, { "epoch": 2.6141732283464565, "grad_norm": 0.20661485389754308, "learning_rate": 7.147024504084014e-06, "loss": 0.2867, "step": 1660 }, { "epoch": 2.615748031496063, "grad_norm": 0.22798922937765315, "learning_rate": 7.117852975495916e-06, "loss": 0.2813, "step": 1661 }, { "epoch": 2.617322834645669, "grad_norm": 0.24168429493687021, "learning_rate": 7.088681446907819e-06, "loss": 0.2911, "step": 1662 }, { "epoch": 2.6188976377952757, "grad_norm": 0.22714546944501837, "learning_rate": 7.059509918319721e-06, "loss": 0.2793, "step": 1663 }, { "epoch": 2.620472440944882, "grad_norm": 0.214983232785899, "learning_rate": 7.030338389731622e-06, "loss": 0.2842, "step": 1664 }, { "epoch": 2.622047244094488, "grad_norm": 0.21453774935462527, "learning_rate": 7.001166861143524e-06, "loss": 0.2924, "step": 1665 }, { "epoch": 2.6236220472440945, "grad_norm": 0.24277644659303307, "learning_rate": 6.9719953325554265e-06, "loss": 0.2813, "step": 1666 }, { "epoch": 2.6251968503937007, "grad_norm": 0.2203788600571817, "learning_rate": 6.9428238039673285e-06, "loss": 0.2832, "step": 1667 }, { "epoch": 2.626771653543307, "grad_norm": 0.20066635403955693, "learning_rate": 6.91365227537923e-06, "loss": 0.2774, "step": 1668 }, { "epoch": 2.6283464566929133, "grad_norm": 0.20742346733681596, "learning_rate": 6.8844807467911315e-06, "loss": 0.2798, "step": 1669 }, { "epoch": 2.6299212598425195, "grad_norm": 0.2204230875383335, "learning_rate": 6.855309218203035e-06, "loss": 0.2875, "step": 1670 }, { "epoch": 2.631496062992126, "grad_norm": 0.2251445725730115, "learning_rate": 6.826137689614936e-06, "loss": 0.2827, "step": 1671 }, { "epoch": 2.633070866141732, "grad_norm": 0.19668770203692276, "learning_rate": 6.796966161026838e-06, "loss": 0.2661, "step": 1672 }, { "epoch": 2.6346456692913387, "grad_norm": 0.21104195058842462, "learning_rate": 6.76779463243874e-06, "loss": 0.2826, "step": 1673 }, { "epoch": 2.636220472440945, "grad_norm": 0.20367060729033365, "learning_rate": 6.738623103850641e-06, "loss": 0.2798, "step": 1674 }, { "epoch": 2.637795275590551, "grad_norm": 0.20530097788620866, "learning_rate": 6.709451575262545e-06, "loss": 0.2731, "step": 1675 }, { "epoch": 2.6393700787401575, "grad_norm": 0.202297624814911, "learning_rate": 6.680280046674446e-06, "loss": 0.2743, "step": 1676 }, { "epoch": 2.6409448818897636, "grad_norm": 0.19943148103832553, "learning_rate": 6.651108518086348e-06, "loss": 0.2719, "step": 1677 }, { "epoch": 2.64251968503937, "grad_norm": 0.22854938157225835, "learning_rate": 6.62193698949825e-06, "loss": 0.2895, "step": 1678 }, { "epoch": 2.6440944881889763, "grad_norm": 0.1971449578639655, "learning_rate": 6.5927654609101524e-06, "loss": 0.2708, "step": 1679 }, { "epoch": 2.6456692913385824, "grad_norm": 0.20734618138601857, "learning_rate": 6.563593932322054e-06, "loss": 0.2745, "step": 1680 }, { "epoch": 2.647244094488189, "grad_norm": 0.20071867398792828, "learning_rate": 6.5344224037339554e-06, "loss": 0.2719, "step": 1681 }, { "epoch": 2.6488188976377955, "grad_norm": 0.2240641561190218, "learning_rate": 6.505250875145857e-06, "loss": 0.2765, "step": 1682 }, { "epoch": 2.6503937007874017, "grad_norm": 0.21327792653058425, "learning_rate": 6.47607934655776e-06, "loss": 0.2922, "step": 1683 }, { "epoch": 2.651968503937008, "grad_norm": 0.2022535945230588, "learning_rate": 6.446907817969662e-06, "loss": 0.2738, "step": 1684 }, { "epoch": 2.653543307086614, "grad_norm": 0.21237545158370408, "learning_rate": 6.417736289381564e-06, "loss": 0.2721, "step": 1685 }, { "epoch": 2.6551181102362205, "grad_norm": 0.2158321193158523, "learning_rate": 6.388564760793465e-06, "loss": 0.2793, "step": 1686 }, { "epoch": 2.656692913385827, "grad_norm": 0.2117319913774414, "learning_rate": 6.359393232205369e-06, "loss": 0.2754, "step": 1687 }, { "epoch": 2.658267716535433, "grad_norm": 0.21043605291526277, "learning_rate": 6.33022170361727e-06, "loss": 0.282, "step": 1688 }, { "epoch": 2.6598425196850393, "grad_norm": 0.20838328513267426, "learning_rate": 6.301050175029172e-06, "loss": 0.2862, "step": 1689 }, { "epoch": 2.661417322834646, "grad_norm": 0.2106080552917318, "learning_rate": 6.271878646441074e-06, "loss": 0.2863, "step": 1690 }, { "epoch": 2.662992125984252, "grad_norm": 0.20813981381567953, "learning_rate": 6.2427071178529756e-06, "loss": 0.2767, "step": 1691 }, { "epoch": 2.6645669291338585, "grad_norm": 0.22184922341889143, "learning_rate": 6.213535589264878e-06, "loss": 0.292, "step": 1692 }, { "epoch": 2.6661417322834646, "grad_norm": 0.19812826762913502, "learning_rate": 6.1843640606767794e-06, "loss": 0.29, "step": 1693 }, { "epoch": 2.6677165354330707, "grad_norm": 0.206579566236018, "learning_rate": 6.155192532088682e-06, "loss": 0.2824, "step": 1694 }, { "epoch": 2.6692913385826773, "grad_norm": 0.2306305967276582, "learning_rate": 6.126021003500583e-06, "loss": 0.277, "step": 1695 }, { "epoch": 2.6708661417322834, "grad_norm": 0.24442852084154393, "learning_rate": 6.096849474912486e-06, "loss": 0.2971, "step": 1696 }, { "epoch": 2.67244094488189, "grad_norm": 0.1979569471750913, "learning_rate": 6.067677946324388e-06, "loss": 0.2862, "step": 1697 }, { "epoch": 2.674015748031496, "grad_norm": 0.2094687689049904, "learning_rate": 6.038506417736289e-06, "loss": 0.2806, "step": 1698 }, { "epoch": 2.6755905511811022, "grad_norm": 0.23137282321852815, "learning_rate": 6.009334889148192e-06, "loss": 0.2961, "step": 1699 }, { "epoch": 2.677165354330709, "grad_norm": 0.22951274720820297, "learning_rate": 5.980163360560093e-06, "loss": 0.2722, "step": 1700 }, { "epoch": 2.678740157480315, "grad_norm": 0.21700194276168538, "learning_rate": 5.950991831971996e-06, "loss": 0.2699, "step": 1701 }, { "epoch": 2.6803149606299215, "grad_norm": 0.210561785647921, "learning_rate": 5.921820303383898e-06, "loss": 0.2783, "step": 1702 }, { "epoch": 2.6818897637795276, "grad_norm": 0.2253477268212324, "learning_rate": 5.8926487747957996e-06, "loss": 0.2964, "step": 1703 }, { "epoch": 2.6834645669291337, "grad_norm": 0.21421998152632737, "learning_rate": 5.8634772462077015e-06, "loss": 0.2887, "step": 1704 }, { "epoch": 2.6850393700787403, "grad_norm": 0.24740506562380782, "learning_rate": 5.834305717619603e-06, "loss": 0.2832, "step": 1705 }, { "epoch": 2.6866141732283464, "grad_norm": 0.20085100248548934, "learning_rate": 5.805134189031505e-06, "loss": 0.279, "step": 1706 }, { "epoch": 2.688188976377953, "grad_norm": 0.22008086310440117, "learning_rate": 5.775962660443407e-06, "loss": 0.2846, "step": 1707 }, { "epoch": 2.689763779527559, "grad_norm": 0.20241222373785478, "learning_rate": 5.746791131855309e-06, "loss": 0.2802, "step": 1708 }, { "epoch": 2.691338582677165, "grad_norm": 0.19762155668110465, "learning_rate": 5.717619603267212e-06, "loss": 0.2649, "step": 1709 }, { "epoch": 2.6929133858267718, "grad_norm": 0.20718702389791838, "learning_rate": 5.688448074679113e-06, "loss": 0.2802, "step": 1710 }, { "epoch": 2.694488188976378, "grad_norm": 0.21338636453416085, "learning_rate": 5.659276546091016e-06, "loss": 0.283, "step": 1711 }, { "epoch": 2.6960629921259844, "grad_norm": 0.198041411472455, "learning_rate": 5.630105017502917e-06, "loss": 0.2699, "step": 1712 }, { "epoch": 2.6976377952755906, "grad_norm": 0.20791436320339482, "learning_rate": 5.60093348891482e-06, "loss": 0.2891, "step": 1713 }, { "epoch": 2.6992125984251967, "grad_norm": 0.2121554049352364, "learning_rate": 5.571761960326722e-06, "loss": 0.2932, "step": 1714 }, { "epoch": 2.7007874015748032, "grad_norm": 0.22901460013390876, "learning_rate": 5.5425904317386235e-06, "loss": 0.2738, "step": 1715 }, { "epoch": 2.7023622047244094, "grad_norm": 0.22174640956570377, "learning_rate": 5.5134189031505255e-06, "loss": 0.2875, "step": 1716 }, { "epoch": 2.703937007874016, "grad_norm": 0.20164179259986442, "learning_rate": 5.484247374562427e-06, "loss": 0.2751, "step": 1717 }, { "epoch": 2.705511811023622, "grad_norm": 0.2060984197744457, "learning_rate": 5.455075845974329e-06, "loss": 0.2838, "step": 1718 }, { "epoch": 2.707086614173228, "grad_norm": 0.213013578740281, "learning_rate": 5.425904317386231e-06, "loss": 0.2713, "step": 1719 }, { "epoch": 2.7086614173228347, "grad_norm": 0.21607269697392192, "learning_rate": 5.396732788798133e-06, "loss": 0.2728, "step": 1720 }, { "epoch": 2.710236220472441, "grad_norm": 0.24156879815351706, "learning_rate": 5.367561260210036e-06, "loss": 0.2983, "step": 1721 }, { "epoch": 2.7118110236220474, "grad_norm": 0.2071726086769826, "learning_rate": 5.338389731621937e-06, "loss": 0.278, "step": 1722 }, { "epoch": 2.7133858267716535, "grad_norm": 0.21378513175236885, "learning_rate": 5.30921820303384e-06, "loss": 0.2865, "step": 1723 }, { "epoch": 2.7149606299212596, "grad_norm": 0.1962436204460089, "learning_rate": 5.280046674445741e-06, "loss": 0.2737, "step": 1724 }, { "epoch": 2.716535433070866, "grad_norm": 0.21546524454985036, "learning_rate": 5.250875145857643e-06, "loss": 0.2834, "step": 1725 }, { "epoch": 2.7181102362204723, "grad_norm": 0.21666564875120609, "learning_rate": 5.221703617269546e-06, "loss": 0.2813, "step": 1726 }, { "epoch": 2.719685039370079, "grad_norm": 0.21177418883603136, "learning_rate": 5.192532088681447e-06, "loss": 0.2729, "step": 1727 }, { "epoch": 2.721259842519685, "grad_norm": 0.1955842774145678, "learning_rate": 5.1633605600933494e-06, "loss": 0.2797, "step": 1728 }, { "epoch": 2.722834645669291, "grad_norm": 0.20875297089682238, "learning_rate": 5.1341890315052505e-06, "loss": 0.2862, "step": 1729 }, { "epoch": 2.7244094488188977, "grad_norm": 0.20395351225053265, "learning_rate": 5.105017502917153e-06, "loss": 0.2858, "step": 1730 }, { "epoch": 2.725984251968504, "grad_norm": 0.21956488833346696, "learning_rate": 5.075845974329055e-06, "loss": 0.2914, "step": 1731 }, { "epoch": 2.7275590551181104, "grad_norm": 0.22168707058283493, "learning_rate": 5.046674445740957e-06, "loss": 0.2778, "step": 1732 }, { "epoch": 2.7291338582677165, "grad_norm": 0.20004206552154458, "learning_rate": 5.017502917152859e-06, "loss": 0.2731, "step": 1733 }, { "epoch": 2.7307086614173226, "grad_norm": 0.20468324057319823, "learning_rate": 4.988331388564761e-06, "loss": 0.2809, "step": 1734 }, { "epoch": 2.732283464566929, "grad_norm": 0.2024065366619702, "learning_rate": 4.959159859976663e-06, "loss": 0.2613, "step": 1735 }, { "epoch": 2.7338582677165353, "grad_norm": 0.23853330545074797, "learning_rate": 4.929988331388565e-06, "loss": 0.2788, "step": 1736 }, { "epoch": 2.735433070866142, "grad_norm": 0.20916761330909484, "learning_rate": 4.900816802800467e-06, "loss": 0.2754, "step": 1737 }, { "epoch": 2.737007874015748, "grad_norm": 0.22165463105819594, "learning_rate": 4.8716452742123696e-06, "loss": 0.2896, "step": 1738 }, { "epoch": 2.738582677165354, "grad_norm": 0.22428701999866565, "learning_rate": 4.842473745624271e-06, "loss": 0.3006, "step": 1739 }, { "epoch": 2.7401574803149606, "grad_norm": 0.23998684348809454, "learning_rate": 4.8133022170361734e-06, "loss": 0.2835, "step": 1740 }, { "epoch": 2.7417322834645668, "grad_norm": 0.2015355743831216, "learning_rate": 4.7841306884480745e-06, "loss": 0.272, "step": 1741 }, { "epoch": 2.7433070866141733, "grad_norm": 0.2076871982812134, "learning_rate": 4.754959159859977e-06, "loss": 0.2834, "step": 1742 }, { "epoch": 2.7448818897637794, "grad_norm": 0.2124039742964504, "learning_rate": 4.725787631271879e-06, "loss": 0.2925, "step": 1743 }, { "epoch": 2.7464566929133856, "grad_norm": 0.20697438894322767, "learning_rate": 4.696616102683781e-06, "loss": 0.2786, "step": 1744 }, { "epoch": 2.748031496062992, "grad_norm": 0.20810890923941625, "learning_rate": 4.667444574095683e-06, "loss": 0.2793, "step": 1745 }, { "epoch": 2.7496062992125987, "grad_norm": 0.21572843579692905, "learning_rate": 4.638273045507585e-06, "loss": 0.2736, "step": 1746 }, { "epoch": 2.751181102362205, "grad_norm": 0.2071981090741202, "learning_rate": 4.609101516919487e-06, "loss": 0.287, "step": 1747 }, { "epoch": 2.752755905511811, "grad_norm": 0.21238222469501924, "learning_rate": 4.579929988331389e-06, "loss": 0.2794, "step": 1748 }, { "epoch": 2.754330708661417, "grad_norm": 0.20353919752933708, "learning_rate": 4.550758459743291e-06, "loss": 0.2923, "step": 1749 }, { "epoch": 2.7559055118110236, "grad_norm": 0.19664582036445172, "learning_rate": 4.521586931155193e-06, "loss": 0.2684, "step": 1750 }, { "epoch": 2.75748031496063, "grad_norm": 0.20609028217132552, "learning_rate": 4.492415402567095e-06, "loss": 0.2944, "step": 1751 }, { "epoch": 2.7590551181102363, "grad_norm": 0.2089187519260169, "learning_rate": 4.4632438739789966e-06, "loss": 0.2835, "step": 1752 }, { "epoch": 2.7606299212598424, "grad_norm": 0.21525238270333372, "learning_rate": 4.4340723453908985e-06, "loss": 0.2924, "step": 1753 }, { "epoch": 2.762204724409449, "grad_norm": 0.22058686559705457, "learning_rate": 4.4049008168028e-06, "loss": 0.2821, "step": 1754 }, { "epoch": 2.763779527559055, "grad_norm": 0.21568115612140268, "learning_rate": 4.375729288214702e-06, "loss": 0.2878, "step": 1755 }, { "epoch": 2.7653543307086617, "grad_norm": 0.2125915806030921, "learning_rate": 4.346557759626604e-06, "loss": 0.2746, "step": 1756 }, { "epoch": 2.7669291338582678, "grad_norm": 0.22268914681376312, "learning_rate": 4.317386231038507e-06, "loss": 0.2886, "step": 1757 }, { "epoch": 2.768503937007874, "grad_norm": 0.22559345675188947, "learning_rate": 4.288214702450408e-06, "loss": 0.283, "step": 1758 }, { "epoch": 2.7700787401574805, "grad_norm": 0.23600215451298595, "learning_rate": 4.259043173862311e-06, "loss": 0.2952, "step": 1759 }, { "epoch": 2.7716535433070866, "grad_norm": 0.1945853204180303, "learning_rate": 4.229871645274212e-06, "loss": 0.2783, "step": 1760 }, { "epoch": 2.773228346456693, "grad_norm": 0.20680551323196203, "learning_rate": 4.200700116686115e-06, "loss": 0.2771, "step": 1761 }, { "epoch": 2.7748031496062993, "grad_norm": 0.21548532067264137, "learning_rate": 4.171528588098017e-06, "loss": 0.2904, "step": 1762 }, { "epoch": 2.7763779527559054, "grad_norm": 0.21685820788851465, "learning_rate": 4.142357059509919e-06, "loss": 0.2756, "step": 1763 }, { "epoch": 2.777952755905512, "grad_norm": 0.2162988139868769, "learning_rate": 4.1131855309218205e-06, "loss": 0.2756, "step": 1764 }, { "epoch": 2.779527559055118, "grad_norm": 0.19708011337007894, "learning_rate": 4.0840140023337225e-06, "loss": 0.2739, "step": 1765 }, { "epoch": 2.7811023622047246, "grad_norm": 0.2363039154054021, "learning_rate": 4.054842473745624e-06, "loss": 0.2672, "step": 1766 }, { "epoch": 2.7826771653543307, "grad_norm": 0.1897726969574424, "learning_rate": 4.025670945157526e-06, "loss": 0.2783, "step": 1767 }, { "epoch": 2.784251968503937, "grad_norm": 0.20466863269095303, "learning_rate": 3.996499416569428e-06, "loss": 0.2676, "step": 1768 }, { "epoch": 2.7858267716535434, "grad_norm": 0.44859488541237774, "learning_rate": 3.967327887981331e-06, "loss": 0.2894, "step": 1769 }, { "epoch": 2.7874015748031495, "grad_norm": 0.2070680339769268, "learning_rate": 3.938156359393232e-06, "loss": 0.2733, "step": 1770 }, { "epoch": 2.788976377952756, "grad_norm": 0.21434119163483806, "learning_rate": 3.908984830805135e-06, "loss": 0.2835, "step": 1771 }, { "epoch": 2.790551181102362, "grad_norm": 0.19759898099627038, "learning_rate": 3.879813302217036e-06, "loss": 0.2692, "step": 1772 }, { "epoch": 2.7921259842519683, "grad_norm": 0.1953844028614356, "learning_rate": 3.850641773628939e-06, "loss": 0.2699, "step": 1773 }, { "epoch": 2.793700787401575, "grad_norm": 0.19332186071919769, "learning_rate": 3.821470245040841e-06, "loss": 0.267, "step": 1774 }, { "epoch": 2.795275590551181, "grad_norm": 0.2119442712153159, "learning_rate": 3.7922987164527426e-06, "loss": 0.2834, "step": 1775 }, { "epoch": 2.7968503937007876, "grad_norm": 0.20922614747928278, "learning_rate": 3.763127187864644e-06, "loss": 0.2838, "step": 1776 }, { "epoch": 2.7984251968503937, "grad_norm": 0.22518843059538765, "learning_rate": 3.733955659276546e-06, "loss": 0.2743, "step": 1777 }, { "epoch": 2.8, "grad_norm": 0.21214891644714276, "learning_rate": 3.7047841306884484e-06, "loss": 0.2675, "step": 1778 }, { "epoch": 2.8015748031496064, "grad_norm": 0.1906851701246333, "learning_rate": 3.67561260210035e-06, "loss": 0.2571, "step": 1779 }, { "epoch": 2.8031496062992125, "grad_norm": 0.22326111149869843, "learning_rate": 3.6464410735122522e-06, "loss": 0.2677, "step": 1780 }, { "epoch": 2.804724409448819, "grad_norm": 0.22738134685284736, "learning_rate": 3.6172695449241537e-06, "loss": 0.2789, "step": 1781 }, { "epoch": 2.806299212598425, "grad_norm": 0.21470520379774055, "learning_rate": 3.588098016336056e-06, "loss": 0.289, "step": 1782 }, { "epoch": 2.8078740157480313, "grad_norm": 0.20304656464350604, "learning_rate": 3.558926487747958e-06, "loss": 0.2753, "step": 1783 }, { "epoch": 2.809448818897638, "grad_norm": 0.2078011837743968, "learning_rate": 3.5297549591598604e-06, "loss": 0.2851, "step": 1784 }, { "epoch": 2.811023622047244, "grad_norm": 0.22502153632114633, "learning_rate": 3.500583430571762e-06, "loss": 0.2922, "step": 1785 }, { "epoch": 2.8125984251968505, "grad_norm": 0.23264388325874868, "learning_rate": 3.4714119019836642e-06, "loss": 0.2863, "step": 1786 }, { "epoch": 2.8141732283464567, "grad_norm": 0.2020660913140766, "learning_rate": 3.4422403733955657e-06, "loss": 0.2839, "step": 1787 }, { "epoch": 2.815748031496063, "grad_norm": 0.19908275057226013, "learning_rate": 3.413068844807468e-06, "loss": 0.2733, "step": 1788 }, { "epoch": 2.8173228346456693, "grad_norm": 0.19767312585448701, "learning_rate": 3.38389731621937e-06, "loss": 0.282, "step": 1789 }, { "epoch": 2.8188976377952755, "grad_norm": 0.1939983317463328, "learning_rate": 3.3547257876312724e-06, "loss": 0.2776, "step": 1790 }, { "epoch": 2.820472440944882, "grad_norm": 0.20809888970113566, "learning_rate": 3.325554259043174e-06, "loss": 0.2688, "step": 1791 }, { "epoch": 2.822047244094488, "grad_norm": 0.19601245637624196, "learning_rate": 3.2963827304550762e-06, "loss": 0.2733, "step": 1792 }, { "epoch": 2.8236220472440943, "grad_norm": 0.21774004396824914, "learning_rate": 3.2672112018669777e-06, "loss": 0.2786, "step": 1793 }, { "epoch": 2.825196850393701, "grad_norm": 0.20774217983850465, "learning_rate": 3.23803967327888e-06, "loss": 0.2911, "step": 1794 }, { "epoch": 2.826771653543307, "grad_norm": 0.18448586891454247, "learning_rate": 3.208868144690782e-06, "loss": 0.2655, "step": 1795 }, { "epoch": 2.8283464566929135, "grad_norm": 0.20136317572397924, "learning_rate": 3.1796966161026844e-06, "loss": 0.2813, "step": 1796 }, { "epoch": 2.8299212598425196, "grad_norm": 0.18813663190122062, "learning_rate": 3.150525087514586e-06, "loss": 0.2777, "step": 1797 }, { "epoch": 2.8314960629921258, "grad_norm": 0.19840976189749598, "learning_rate": 3.1213535589264878e-06, "loss": 0.268, "step": 1798 }, { "epoch": 2.8330708661417323, "grad_norm": 0.2644366370341188, "learning_rate": 3.0921820303383897e-06, "loss": 0.28, "step": 1799 }, { "epoch": 2.8346456692913384, "grad_norm": 0.19593221696733532, "learning_rate": 3.0630105017502916e-06, "loss": 0.2652, "step": 1800 }, { "epoch": 2.836220472440945, "grad_norm": 0.2097557641875297, "learning_rate": 3.033838973162194e-06, "loss": 0.2924, "step": 1801 }, { "epoch": 2.837795275590551, "grad_norm": 0.19605873198798685, "learning_rate": 3.004667444574096e-06, "loss": 0.274, "step": 1802 }, { "epoch": 2.8393700787401572, "grad_norm": 0.19524255099110152, "learning_rate": 2.975495915985998e-06, "loss": 0.274, "step": 1803 }, { "epoch": 2.840944881889764, "grad_norm": 0.20028631795172644, "learning_rate": 2.9463243873978998e-06, "loss": 0.2786, "step": 1804 }, { "epoch": 2.84251968503937, "grad_norm": 0.20582455221573417, "learning_rate": 2.9171528588098017e-06, "loss": 0.2738, "step": 1805 }, { "epoch": 2.8440944881889765, "grad_norm": 0.20030259893813018, "learning_rate": 2.8879813302217036e-06, "loss": 0.2895, "step": 1806 }, { "epoch": 2.8456692913385826, "grad_norm": 0.19123367599131327, "learning_rate": 2.858809801633606e-06, "loss": 0.2746, "step": 1807 }, { "epoch": 2.8472440944881887, "grad_norm": 0.20497004151781498, "learning_rate": 2.829638273045508e-06, "loss": 0.2836, "step": 1808 }, { "epoch": 2.8488188976377953, "grad_norm": 0.20560137336471715, "learning_rate": 2.80046674445741e-06, "loss": 0.2715, "step": 1809 }, { "epoch": 2.850393700787402, "grad_norm": 0.19918188627586708, "learning_rate": 2.7712952158693118e-06, "loss": 0.2833, "step": 1810 }, { "epoch": 2.851968503937008, "grad_norm": 0.20896980290233635, "learning_rate": 2.7421236872812137e-06, "loss": 0.2636, "step": 1811 }, { "epoch": 2.853543307086614, "grad_norm": 0.19215344096622677, "learning_rate": 2.7129521586931156e-06, "loss": 0.2795, "step": 1812 }, { "epoch": 2.85511811023622, "grad_norm": 0.19917978887413854, "learning_rate": 2.683780630105018e-06, "loss": 0.2698, "step": 1813 }, { "epoch": 2.8566929133858268, "grad_norm": 0.1880976769683847, "learning_rate": 2.65460910151692e-06, "loss": 0.2721, "step": 1814 }, { "epoch": 2.8582677165354333, "grad_norm": 0.2017716883633975, "learning_rate": 2.6254375729288214e-06, "loss": 0.2771, "step": 1815 }, { "epoch": 2.8598425196850394, "grad_norm": 0.19287158572751978, "learning_rate": 2.5962660443407233e-06, "loss": 0.2826, "step": 1816 }, { "epoch": 2.8614173228346456, "grad_norm": 0.22140550234146456, "learning_rate": 2.5670945157526253e-06, "loss": 0.2953, "step": 1817 }, { "epoch": 2.862992125984252, "grad_norm": 0.2136065413224161, "learning_rate": 2.5379229871645276e-06, "loss": 0.2811, "step": 1818 }, { "epoch": 2.8645669291338582, "grad_norm": 0.18141759203392935, "learning_rate": 2.5087514585764295e-06, "loss": 0.2705, "step": 1819 }, { "epoch": 2.866141732283465, "grad_norm": 0.2055595783930763, "learning_rate": 2.4795799299883315e-06, "loss": 0.2792, "step": 1820 }, { "epoch": 2.867716535433071, "grad_norm": 0.20494951740323974, "learning_rate": 2.4504084014002334e-06, "loss": 0.2814, "step": 1821 }, { "epoch": 2.869291338582677, "grad_norm": 0.1925770480858743, "learning_rate": 2.4212368728121353e-06, "loss": 0.2719, "step": 1822 }, { "epoch": 2.8708661417322836, "grad_norm": 0.1977308688694594, "learning_rate": 2.3920653442240373e-06, "loss": 0.281, "step": 1823 }, { "epoch": 2.8724409448818897, "grad_norm": 0.19977768405245896, "learning_rate": 2.3628938156359396e-06, "loss": 0.2879, "step": 1824 }, { "epoch": 2.8740157480314963, "grad_norm": 0.2094401582786621, "learning_rate": 2.3337222870478415e-06, "loss": 0.2692, "step": 1825 }, { "epoch": 2.8755905511811024, "grad_norm": 0.20718538644398943, "learning_rate": 2.3045507584597435e-06, "loss": 0.2763, "step": 1826 }, { "epoch": 2.8771653543307085, "grad_norm": 0.19976964039851455, "learning_rate": 2.2753792298716454e-06, "loss": 0.2875, "step": 1827 }, { "epoch": 2.878740157480315, "grad_norm": 0.20460205746647023, "learning_rate": 2.2462077012835473e-06, "loss": 0.2839, "step": 1828 }, { "epoch": 2.880314960629921, "grad_norm": 0.20022310460059162, "learning_rate": 2.2170361726954492e-06, "loss": 0.2916, "step": 1829 }, { "epoch": 2.8818897637795278, "grad_norm": 0.19107460939568016, "learning_rate": 2.187864644107351e-06, "loss": 0.2778, "step": 1830 }, { "epoch": 2.883464566929134, "grad_norm": 0.1815901961901926, "learning_rate": 2.1586931155192535e-06, "loss": 0.2632, "step": 1831 }, { "epoch": 2.88503937007874, "grad_norm": 0.18665846430070265, "learning_rate": 2.1295215869311555e-06, "loss": 0.2671, "step": 1832 }, { "epoch": 2.8866141732283466, "grad_norm": 0.18707060484601357, "learning_rate": 2.1003500583430574e-06, "loss": 0.274, "step": 1833 }, { "epoch": 2.8881889763779527, "grad_norm": 0.21250476023900536, "learning_rate": 2.0711785297549593e-06, "loss": 0.2988, "step": 1834 }, { "epoch": 2.8897637795275593, "grad_norm": 0.2034003789684784, "learning_rate": 2.0420070011668612e-06, "loss": 0.2945, "step": 1835 }, { "epoch": 2.8913385826771654, "grad_norm": 0.19847442567033616, "learning_rate": 2.012835472578763e-06, "loss": 0.2814, "step": 1836 }, { "epoch": 2.8929133858267715, "grad_norm": 0.19350470562385816, "learning_rate": 1.9836639439906655e-06, "loss": 0.2702, "step": 1837 }, { "epoch": 2.894488188976378, "grad_norm": 0.20906705527411237, "learning_rate": 1.9544924154025674e-06, "loss": 0.2814, "step": 1838 }, { "epoch": 2.896062992125984, "grad_norm": 0.19031750232210132, "learning_rate": 1.9253208868144694e-06, "loss": 0.2884, "step": 1839 }, { "epoch": 2.8976377952755907, "grad_norm": 0.19530555524689153, "learning_rate": 1.8961493582263713e-06, "loss": 0.2827, "step": 1840 }, { "epoch": 2.899212598425197, "grad_norm": 0.19407881126746568, "learning_rate": 1.866977829638273e-06, "loss": 0.276, "step": 1841 }, { "epoch": 2.900787401574803, "grad_norm": 0.1957264707880579, "learning_rate": 1.837806301050175e-06, "loss": 0.291, "step": 1842 }, { "epoch": 2.9023622047244095, "grad_norm": 0.2016543364706638, "learning_rate": 1.8086347724620769e-06, "loss": 0.2712, "step": 1843 }, { "epoch": 2.9039370078740157, "grad_norm": 0.18768897176390203, "learning_rate": 1.779463243873979e-06, "loss": 0.2734, "step": 1844 }, { "epoch": 2.905511811023622, "grad_norm": 0.19697997185100885, "learning_rate": 1.750291715285881e-06, "loss": 0.2803, "step": 1845 }, { "epoch": 2.9070866141732283, "grad_norm": 0.18922870776046496, "learning_rate": 1.7211201866977829e-06, "loss": 0.2648, "step": 1846 }, { "epoch": 2.9086614173228345, "grad_norm": 0.19841974223688122, "learning_rate": 1.691948658109685e-06, "loss": 0.2686, "step": 1847 }, { "epoch": 2.910236220472441, "grad_norm": 0.19017484506961282, "learning_rate": 1.662777129521587e-06, "loss": 0.2675, "step": 1848 }, { "epoch": 2.911811023622047, "grad_norm": 0.1892870919748703, "learning_rate": 1.6336056009334889e-06, "loss": 0.2792, "step": 1849 }, { "epoch": 2.9133858267716537, "grad_norm": 0.19622250493866633, "learning_rate": 1.604434072345391e-06, "loss": 0.2843, "step": 1850 }, { "epoch": 2.91496062992126, "grad_norm": 0.18820458327867798, "learning_rate": 1.575262543757293e-06, "loss": 0.2628, "step": 1851 }, { "epoch": 2.916535433070866, "grad_norm": 0.19660384135781575, "learning_rate": 1.5460910151691949e-06, "loss": 0.2856, "step": 1852 }, { "epoch": 2.9181102362204725, "grad_norm": 0.19185338894575818, "learning_rate": 1.516919486581097e-06, "loss": 0.2741, "step": 1853 }, { "epoch": 2.9196850393700786, "grad_norm": 0.20179334037255436, "learning_rate": 1.487747957992999e-06, "loss": 0.2882, "step": 1854 }, { "epoch": 2.921259842519685, "grad_norm": 0.19339040352713988, "learning_rate": 1.4585764294049009e-06, "loss": 0.301, "step": 1855 }, { "epoch": 2.9228346456692913, "grad_norm": 0.1945134500511514, "learning_rate": 1.429404900816803e-06, "loss": 0.2784, "step": 1856 }, { "epoch": 2.9244094488188974, "grad_norm": 0.18725380099600406, "learning_rate": 1.400233372228705e-06, "loss": 0.2712, "step": 1857 }, { "epoch": 2.925984251968504, "grad_norm": 0.1973907525829502, "learning_rate": 1.3710618436406068e-06, "loss": 0.263, "step": 1858 }, { "epoch": 2.92755905511811, "grad_norm": 0.19992545596252434, "learning_rate": 1.341890315052509e-06, "loss": 0.2734, "step": 1859 }, { "epoch": 2.9291338582677167, "grad_norm": 0.19627263607992604, "learning_rate": 1.3127187864644107e-06, "loss": 0.2896, "step": 1860 }, { "epoch": 2.930708661417323, "grad_norm": 0.21206500418528956, "learning_rate": 1.2835472578763126e-06, "loss": 0.2798, "step": 1861 }, { "epoch": 2.932283464566929, "grad_norm": 0.18481227231652295, "learning_rate": 1.2543757292882148e-06, "loss": 0.2531, "step": 1862 }, { "epoch": 2.9338582677165355, "grad_norm": 0.18648325134480237, "learning_rate": 1.2252042007001167e-06, "loss": 0.2729, "step": 1863 }, { "epoch": 2.9354330708661416, "grad_norm": 0.18344059813033636, "learning_rate": 1.1960326721120186e-06, "loss": 0.2605, "step": 1864 }, { "epoch": 2.937007874015748, "grad_norm": 0.20472148246162944, "learning_rate": 1.1668611435239208e-06, "loss": 0.2837, "step": 1865 }, { "epoch": 2.9385826771653543, "grad_norm": 0.19841176974406335, "learning_rate": 1.1376896149358227e-06, "loss": 0.2762, "step": 1866 }, { "epoch": 2.9401574803149604, "grad_norm": 0.1997267058261043, "learning_rate": 1.1085180863477246e-06, "loss": 0.2786, "step": 1867 }, { "epoch": 2.941732283464567, "grad_norm": 0.21628974559022343, "learning_rate": 1.0793465577596268e-06, "loss": 0.2917, "step": 1868 }, { "epoch": 2.943307086614173, "grad_norm": 0.1856653656729586, "learning_rate": 1.0501750291715287e-06, "loss": 0.2701, "step": 1869 }, { "epoch": 2.9448818897637796, "grad_norm": 0.18599343460083642, "learning_rate": 1.0210035005834306e-06, "loss": 0.2752, "step": 1870 }, { "epoch": 2.9464566929133857, "grad_norm": 0.18655906170410472, "learning_rate": 9.918319719953328e-07, "loss": 0.2725, "step": 1871 }, { "epoch": 2.948031496062992, "grad_norm": 0.18592652895035605, "learning_rate": 9.626604434072347e-07, "loss": 0.2715, "step": 1872 }, { "epoch": 2.9496062992125984, "grad_norm": 0.1949195061816229, "learning_rate": 9.334889148191365e-07, "loss": 0.2786, "step": 1873 }, { "epoch": 2.951181102362205, "grad_norm": 0.19086541193131223, "learning_rate": 9.043173862310384e-07, "loss": 0.2705, "step": 1874 }, { "epoch": 2.952755905511811, "grad_norm": 0.21581605447987753, "learning_rate": 8.751458576429405e-07, "loss": 0.2922, "step": 1875 }, { "epoch": 2.9543307086614172, "grad_norm": 0.19048489286002826, "learning_rate": 8.459743290548425e-07, "loss": 0.2708, "step": 1876 }, { "epoch": 2.9559055118110233, "grad_norm": 0.1922280138806289, "learning_rate": 8.168028004667444e-07, "loss": 0.266, "step": 1877 }, { "epoch": 2.95748031496063, "grad_norm": 0.1951545997085052, "learning_rate": 7.876312718786465e-07, "loss": 0.2881, "step": 1878 }, { "epoch": 2.9590551181102365, "grad_norm": 0.18535287489493232, "learning_rate": 7.584597432905485e-07, "loss": 0.2711, "step": 1879 }, { "epoch": 2.9606299212598426, "grad_norm": 0.387537052754549, "learning_rate": 7.292882147024504e-07, "loss": 0.2986, "step": 1880 }, { "epoch": 2.9622047244094487, "grad_norm": 0.19227405380326612, "learning_rate": 7.001166861143525e-07, "loss": 0.2647, "step": 1881 }, { "epoch": 2.9637795275590553, "grad_norm": 0.20088942991907102, "learning_rate": 6.709451575262545e-07, "loss": 0.2727, "step": 1882 }, { "epoch": 2.9653543307086614, "grad_norm": 0.19103538183909105, "learning_rate": 6.417736289381563e-07, "loss": 0.2777, "step": 1883 }, { "epoch": 2.966929133858268, "grad_norm": 0.2038042728466092, "learning_rate": 6.126021003500583e-07, "loss": 0.2886, "step": 1884 }, { "epoch": 2.968503937007874, "grad_norm": 0.19114902342895262, "learning_rate": 5.834305717619604e-07, "loss": 0.2872, "step": 1885 }, { "epoch": 2.97007874015748, "grad_norm": 0.18108221817813297, "learning_rate": 5.542590431738623e-07, "loss": 0.2689, "step": 1886 }, { "epoch": 2.9716535433070868, "grad_norm": 0.1890084625552301, "learning_rate": 5.250875145857643e-07, "loss": 0.2741, "step": 1887 }, { "epoch": 2.973228346456693, "grad_norm": 0.1904396833393843, "learning_rate": 4.959159859976664e-07, "loss": 0.2817, "step": 1888 }, { "epoch": 2.9748031496062994, "grad_norm": 0.20314405297454396, "learning_rate": 4.6674445740956825e-07, "loss": 0.2702, "step": 1889 }, { "epoch": 2.9763779527559056, "grad_norm": 0.1957757400584214, "learning_rate": 4.3757292882147023e-07, "loss": 0.2707, "step": 1890 }, { "epoch": 2.9779527559055117, "grad_norm": 0.20120854726054124, "learning_rate": 4.084014002333722e-07, "loss": 0.2855, "step": 1891 }, { "epoch": 2.9795275590551182, "grad_norm": 0.21225938863080834, "learning_rate": 3.7922987164527425e-07, "loss": 0.2805, "step": 1892 }, { "epoch": 2.9811023622047244, "grad_norm": 0.20979380339287196, "learning_rate": 3.5005834305717623e-07, "loss": 0.2875, "step": 1893 }, { "epoch": 2.982677165354331, "grad_norm": 0.20013282382862377, "learning_rate": 3.2088681446907816e-07, "loss": 0.2897, "step": 1894 }, { "epoch": 2.984251968503937, "grad_norm": 0.20526637807686748, "learning_rate": 2.917152858809802e-07, "loss": 0.3008, "step": 1895 }, { "epoch": 2.985826771653543, "grad_norm": 0.19334951507252754, "learning_rate": 2.6254375729288217e-07, "loss": 0.2785, "step": 1896 }, { "epoch": 2.9874015748031497, "grad_norm": 0.19115207497405165, "learning_rate": 2.3337222870478413e-07, "loss": 0.2696, "step": 1897 }, { "epoch": 2.988976377952756, "grad_norm": 0.18598193897106904, "learning_rate": 2.042007001166861e-07, "loss": 0.279, "step": 1898 }, { "epoch": 2.9905511811023624, "grad_norm": 0.20090536994602645, "learning_rate": 1.7502917152858811e-07, "loss": 0.2832, "step": 1899 }, { "epoch": 2.9921259842519685, "grad_norm": 0.18961482749811268, "learning_rate": 1.458576429404901e-07, "loss": 0.2819, "step": 1900 }, { "epoch": 2.9937007874015746, "grad_norm": 0.1895981360778471, "learning_rate": 1.1668611435239206e-07, "loss": 0.2828, "step": 1901 }, { "epoch": 2.995275590551181, "grad_norm": 0.18679747448500417, "learning_rate": 8.751458576429406e-08, "loss": 0.2761, "step": 1902 }, { "epoch": 2.9968503937007873, "grad_norm": 0.1926344989112511, "learning_rate": 5.834305717619603e-08, "loss": 0.2794, "step": 1903 }, { "epoch": 2.998425196850394, "grad_norm": 0.200867475710644, "learning_rate": 2.9171528588098016e-08, "loss": 0.2969, "step": 1904 }, { "epoch": 3.0, "grad_norm": 0.18404179137710658, "learning_rate": 0.0, "loss": 0.2638, "step": 1905 }, { "epoch": 3.0, "step": 1905, "total_flos": 1.6283741818737132e+18, "train_loss": 0.4442822661299718, "train_runtime": 111241.4508, "train_samples_per_second": 0.274, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 1905, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6283741818737132e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }